#' View or Establish Custom Norming Contexts
#'
#' Custom norming contexts can be used to process later texts by specifying the
#' \code{custom_context} API argument in the \code{receptiviti} function (e.g.,
#' \code{receptiviti("text to score", version = "v2",
#' options = list(custom_context = "norm_name"))},
#' where \code{norm_name} is the name you set here).
#'
#' @param name Name of a new norming context, to be established from the provided \code{text}.
#' Not providing a name will list the previously created contexts.
#' @param text Text to be processed and used as the custom norming context.
#' Not providing text will return the status of the named norming context.
#' @param options Options to set for the norming context (e.g.,
#' \code{list(min_word_count = 350,} \code{max_punctuation = .25)}).
#' @param delete Logical; If \code{TRUE}, will request to remove the \code{name} context.
#' @param name_only Logical; If \code{TRUE}, will return a character vector of names
#' only, including those of built-in contexts.
#' @param id,text_column,id_column,files,dir,file_type,collapse_lines,encoding Additional
#' arguments used to handle \code{text}; same as those in \code{\link{receptiviti}}.
#' @param bundle_size,bundle_byte_limit,retry_limit,clear_scratch_cache,use_future,in_memory
#' Additional arguments used to manage the requests; same as those in
#' \code{\link{receptiviti}}.
#' @param key,secret,url Request arguments; same as those in \code{\link{receptiviti}}.
#' @param verbose Logical; if \code{TRUE}, will show status messages.
#' @returns Nothing if \code{delete} is \code{TRUE}.
#' Otherwise, if \code{name} is not specified, a character vector containing names of each
#' available norming context (built-in and custom).
#' If \code{text} is not specified, the status of the
#' named context in a \code{list}. If \code{text}s are provided, a \code{list}:
#' \itemize{
#'   \item \code{initial_status}: Initial status of the context.
#'   \item \code{first_pass}: Response after texts are sent the first time, or
#'     \code{NULL} if the initial status is \code{pass_two}.
#'   \item \code{second_pass}: Response after texts are sent the second time.
#' }
#' @examples
#' \dontrun{
#'
#' # get status of all existing custom norming contexts
#' contexts <- receptiviti_norming(name_only = TRUE)
#'
#' # create or get the status of a single custom norming context
#' status <- receptiviti_norming("new_context")
#'
#' # send texts to establish the context
#'
#' ## these texts can be specified just like
#' ## texts in the main receptiviti function
#'
#' ## such as directly
#' full_status <- receptiviti_norming("new_context", c(
#'   "a text to set the norm",
#'   "another text part of the new context"
#' ))
#'
#' ## or from a file
#' full_status <- receptiviti_norming(
#'   "new_context", "./path/to/text.csv",
#'   text_column = "texts"
#' )
#'
#' ## or from multiple files in a directory
#' full_status <- receptiviti_norming(
#'   "new_context",
#'   dir = "./path/to/txt_files"
#' )
#' }
#' @export
receptiviti_norming <- function(
  name = NULL,
  text = NULL,
  options = list(),
  delete = FALSE,
  name_only = FALSE,
  id = NULL,
  text_column = NULL,
  id_column = NULL,
  files = NULL,
  dir = NULL,
  file_type = "txt",
  collapse_lines = FALSE,
  encoding = NULL,
  bundle_size = 1000,
  bundle_byte_limit = 75e5,
  retry_limit = 50,
  clear_scratch_cache = TRUE,
  use_future = FALSE,
  in_memory = TRUE,
  url = Sys.getenv("RECEPTIVITI_URL"),
  key = Sys.getenv("RECEPTIVITI_KEY"),
  secret = Sys.getenv("RECEPTIVITI_SECRET"),
  verbose = TRUE
) {
  params <- handle_request_params(url, key, secret)

  # name_only short-circuits to the full norming list endpoint, which
  # includes built-in contexts as well as custom ones
  if (name_only) {
    req <- curl::curl_fetch_memory(
      paste0(params$url, "/v2/norming"),
      params$handler
    )
    if (req$status_code != 200) {
      stop(
        "failed to make norming list request: ",
        req$status_code,
        call. = FALSE
      )
    }
    norms <- jsonlite::fromJSON(rawToChar(req$content))
    if (verbose) {
      if (length(norms)) {
        message(
          "available norming context(s): ",
          paste(sub("custom/", "", norms, fixed = TRUE), collapse = ", ")
        )
      } else {
        message("no custom norming contexts found")
      }
    }
    return(norms)
  }

  baseurl <- paste0(params$url, "/v2/norming/custom/")
  # custom context names become URL path segments, so restrict characters
  if (!is.null(name) && grepl("[^a-z0-9_.-]", name)) {
    stop(
      "`name` can only include lowercase letters, numbers, hyphens, underscores, or periods",
      call. = FALSE
    )
  }

  # list current custom contexts; also used to check whether `name` exists
  req <- curl::curl_fetch_memory(baseurl, params$handler)
  if (req$status_code != 200) {
    stop(
      "failed to make norming list request: ",
      req$status_code,
      call. = FALSE
    )
  }
  norms <- jsonlite::fromJSON(rawToChar(req$content))
  if (length(norms)) {
    if (verbose && is.null(name)) {
      message(
        "custom norming context(s) found: ",
        paste(sub("custom/", "", norms$name, fixed = TRUE), collapse = ", ")
      )
    }
  } else {
    if (verbose && is.null(name)) message("no custom norming contexts found")
    norms <- NULL
  }
  if (is.null(name)) {
    return(norms)
  }

  context_id <- paste0("custom/", name)
  if (context_id %in% norms$name) {
    # the named context already exists: delete it or retrieve its status
    if (delete) {
      curl::handle_setopt(params$handler, customrequest = "DELETE")
      req <- curl::curl_fetch_memory(paste0(baseurl, name), params$handler)
      if (req$status_code != 200) {
        # avoid shadowing base::message(); error bodies may be JSON
        err <- list(error = rawToChar(req$content))
        if (substr(err$error, 1, 1) == "{")
          err$error <- jsonlite::fromJSON(err$error)
        stop(
          "failed to delete custom norming context: ",
          err$error,
          call. = FALSE
        )
      }
      return(invisible(NULL))
    }
    status <- as.list(norms[norms$name == context_id, ])
    if (length(options)) {
      # options can only be set at creation time
      warning(
        "context ",
        name,
        " already exists, so options do not apply",
        call. = FALSE
      )
    }
  } else if (!delete) {
    # establish a new context if needed
    if (verbose) message("requesting creation of custom context ", name)
    curl::handle_setopt(
      params$handler,
      copypostfields = jsonlite::toJSON(
        c(name = name, options),
        auto_unbox = TRUE
      )
    )
    req <- curl::curl_fetch_memory(baseurl, params$handler)
    if (req$status_code != 200) {
      # avoid shadowing base::message(); error bodies may be JSON
      err <- list(error = rawToChar(req$content))
      if (substr(err$error, 1, 1) == "{")
        err$error <- jsonlite::fromJSON(err$error)
      stop(
        "failed to make norming creation request: ",
        err$error,
        call. = FALSE
      )
    }
    status <- jsonlite::fromJSON(rawToChar(req$content))
    # warn if the server did not accept any requested option value
    for (option in names(options)) {
      if (!is.null(status[[option]]) && status[[option]] != options[[option]]) {
        warning(
          "set option ",
          option,
          " does not match the requested value",
          call. = FALSE
        )
      }
    }
  }
  if (delete) {
    # delete was requested, but the context was not found
    message("context ", name, " does not exist")
    return(invisible(NULL))
  }
  if (verbose) {
    message(
      "status of ",
      name,
      ": ",
      jsonlite::toJSON(status, pretty = TRUE, auto_unbox = TRUE)
    )
  }
  if (is.null(text)) {
    return(status)
  }
  # texts can only be submitted to a freshly created context
  if (status$status != "created") {
    warning("status is not `created`, so cannot send text", call. = FALSE)
    return(invisible(list(
      initial_status = status,
      first_pass = NULL,
      second_pass = NULL
    )))
  }

  # norming requires the same texts to be sent twice (endpoints /one and /two)
  if (verbose) message("sending first-pass samples for ", name)
  first_pass <- manage_request(
    text,
    id = id,
    text_column = text_column,
    id_column = id_column,
    files = files,
    dir = dir,
    file_type = file_type,
    collapse_lines = collapse_lines,
    encoding = encoding,
    bundle_size = bundle_size,
    bundle_byte_limit = bundle_byte_limit,
    retry_limit = retry_limit,
    clear_scratch_cache = clear_scratch_cache,
    cores = 1,
    use_future = use_future,
    in_memory = in_memory,
    url = paste0(baseurl, name, "/one"),
    key = key,
    secret = secret,
    verbose = verbose,
    to_norming = TRUE
  )$final_res
  second_pass <- NULL
  if (
    !is.null(first_pass$analyzed_samples) &&
      all(first_pass$analyzed_samples == 0)
  ) {
    warning(
      "no texts were successfully analyzed in the first pass, so second pass was skipped",
      call. = FALSE
    )
  } else {
    if (verbose) message("sending second-pass samples for ", name)
    second_pass <- manage_request(
      text,
      id = id,
      text_column = text_column,
      id_column = id_column,
      files = files,
      dir = dir,
      file_type = file_type,
      collapse_lines = collapse_lines,
      encoding = encoding,
      bundle_size = bundle_size,
      bundle_byte_limit = bundle_byte_limit,
      retry_limit = retry_limit,
      clear_scratch_cache = clear_scratch_cache,
      cores = 1,
      use_future = use_future,
      in_memory = in_memory,
      url = paste0(baseurl, name, "/two"),
      key = key,
      secret = secret,
      verbose = verbose,
      to_norming = TRUE
    )$final_res
  }
  if (
    !is.null(second_pass$analyzed_samples) &&
      all(second_pass$analyzed_samples == 0)
  ) {
    warning(
      "no texts were successfully analyzed in the second pass",
      call. = FALSE
    )
  }
  invisible(list(
    initial_status = status,
    first_pass = first_pass,
    second_pass = second_pass
  ))
}
1 |
manage_request <- function( |
|
2 |
text = NULL, |
|
3 |
id = NULL, |
|
4 |
text_column = NULL, |
|
5 |
id_column = NULL, |
|
6 |
files = NULL, |
|
7 |
dir = NULL, |
|
8 |
file_type = "txt", |
|
9 |
encoding = NULL, |
|
10 |
context = "written", |
|
11 |
api_args = getOption("receptiviti.api_args", list()), |
|
12 |
bundle_size = 1000, |
|
13 |
bundle_byte_limit = 75e5, |
|
14 |
collapse_lines = FALSE, |
|
15 |
retry_limit = 50, |
|
16 |
clear_scratch_cache = TRUE, |
|
17 |
request_cache = TRUE, |
|
18 |
cores = detectCores() - 1, |
|
19 |
collect_results = TRUE, |
|
20 |
use_future = FALSE, |
|
21 |
in_memory = TRUE, |
|
22 |
verbose = FALSE, |
|
23 |
make_request = TRUE, |
|
24 |
text_as_paths = FALSE, |
|
25 |
cache = Sys.getenv("RECEPTIVITI_CACHE"), |
|
26 |
cache_overwrite = FALSE, |
|
27 |
cache_format = Sys.getenv("RECEPTIVITI_CACHE_FORMAT", "parquet"), |
|
28 |
key = Sys.getenv("RECEPTIVITI_KEY"), |
|
29 |
secret = Sys.getenv("RECEPTIVITI_SECRET"), |
|
30 |
url = Sys.getenv("RECEPTIVITI_URL"), |
|
31 |
version = Sys.getenv("RECEPTIVITI_VERSION"), |
|
32 |
endpoint = Sys.getenv("RECEPTIVITI_ENDPOINT"), |
|
33 |
to_norming = FALSE |
|
34 |
) { |
|
35 |
# check input |
|
36 | 70x |
if (use_future && !requireNamespace("future.apply", quietly = TRUE)) { |
37 | ! |
stop("install the `future.apply` package to use future", call. = FALSE) |
38 |
} |
|
39 | 70x |
st <- proc.time()[[3]] |
40 | 70x |
text_as_dir <- FALSE |
41 | 70x |
if (is.null(text)) { |
42 | 3x |
if (!is.null(dir)) { |
43 | ! |
if (!dir.exists(dir)) stop("entered dir does not exist", call. = FALSE) |
44 | 1x |
text <- dir |
45 | 1x |
text_as_dir <- TRUE |
46 | 2x |
} else if (!is.null(files)) { |
47 | 1x |
text <- files |
48 | 1x |
text_as_paths <- TRUE |
49 |
} else { |
|
50 | 1x |
stop( |
51 | 1x |
"enter text as the first argument, or use files or dir", |
52 | 1x |
call. = FALSE |
53 |
) |
|
54 |
} |
|
55 |
} |
|
56 | 69x |
if (text_as_paths) { |
57 | 3x |
if (anyNA(text)) |
58 | 1x |
stop( |
59 | 1x |
"NAs are not allowed in text when being treated as file paths", |
60 | 1x |
call. = FALSE |
61 |
) |
|
62 | 2x |
if (!all(file.exists(text))) |
63 | 1x |
stop("not all of the files in text exist", call. = FALSE) |
64 |
} |
|
65 | 67x |
read_in <- FALSE |
66 | 67x |
handle_encoding <- function(file) { |
67 | 162x |
if (is.null(encoding)) { |
68 | 160x |
con <- gzfile(file, "rb") |
69 | 160x |
on.exit(close(con)) |
70 | 160x |
unlist(stringi::stri_enc_detect(readBin(con, "raw", file.size(file)))[[ |
71 | 160x |
1 |
72 | 160x |
]])[[1]] |
73 |
} else { |
|
74 | 2x |
encoding |
75 |
} |
|
76 |
} |
|
77 |
if ( |
|
78 | 67x |
text_as_dir || |
79 | 67x |
text_as_paths || |
80 | 67x |
(is.character(text) && !anyNA(text) && all(nchar(text) < 500)) |
81 |
) { |
|
82 | 50x |
if (text_as_dir || length(text) == 1 && dir.exists(text)) { |
83 | 4x |
if (verbose) |
84 | ! |
message( |
85 | ! |
"reading in texts from directory: ", |
86 | ! |
text, |
87 |
" (", |
|
88 | ! |
round(proc.time()[[3]] - st, 4), |
89 |
")" |
|
90 |
) |
|
91 | 4x |
text_as_paths <- TRUE |
92 | 4x |
text <- normalizePath( |
93 | 4x |
list.files(text, file_type, full.names = TRUE), |
94 |
"/", |
|
95 | 4x |
FALSE |
96 |
) |
|
97 |
} |
|
98 | 50x |
if (text_as_paths || all(file.exists(text))) { |
99 | 14x |
text_as_paths <- collapse_lines |
100 | 14x |
if (!collapse_lines) { |
101 | 12x |
if (verbose) |
102 | ! |
message( |
103 | ! |
"reading in texts from file list (", |
104 | ! |
round(proc.time()[[3]] - st, 4), |
105 |
")" |
|
106 |
) |
|
107 | 12x |
if (is.null(id_column)) |
108 | 11x |
names(text) <- if (length(id) != length(text)) text else id |
109 | 12x |
if (all(grepl("\\.csv", text, TRUE))) { |
110 | 6x |
if (is.null(text_column)) |
111 | 2x |
stop( |
112 | 2x |
"text appears to point to csv files, but text_column was not specified", |
113 | 2x |
call. = FALSE |
114 |
) |
|
115 | 4x |
read_in <- TRUE |
116 | 4x |
text <- unlist(lapply(text, function(f) { |
117 | 4x |
d <- tryCatch( |
118 |
{ |
|
119 | 4x |
enc <- handle_encoding(f) |
120 | 4x |
con <- gzfile(f, encoding = enc) |
121 | 4x |
arrow::read_csv_arrow( |
122 | 4x |
con, |
123 | 4x |
read_options = arrow::CsvReadOptions$create( |
124 | 4x |
encoding = enc |
125 |
), |
|
126 | 4x |
col_select = c(text_column, id_column) |
127 |
) |
|
128 |
}, |
|
129 | 4x |
error = function(e) NULL |
130 |
) |
|
131 | 1x |
if (is.null(d)) stop("failed to read in file ", f, call. = FALSE) |
132 | 3x |
if (!is.null(id_column) && id_column %in% colnames(d)) { |
133 | 1x |
structure( |
134 | 1x |
d[, text_column, drop = TRUE], |
135 | 1x |
names = d[, id_column, drop = TRUE] |
136 |
) |
|
137 |
} else { |
|
138 | 2x |
d[, text_column, drop = TRUE] |
139 |
} |
|
140 |
})) |
|
141 |
} else { |
|
142 | 6x |
text <- unlist(lapply(text, function(f) { |
143 | 106x |
tryCatch( |
144 |
{ |
|
145 | 106x |
con <- gzfile(f, encoding = handle_encoding(f)) |
146 | 106x |
on.exit(close(con)) |
147 | 106x |
d <- readLines(con, warn = FALSE, skipNul = TRUE) |
148 | 106x |
d[d != ""] |
149 |
}, |
|
150 | 106x |
error = function(e) |
151 | 106x |
stop("failed to read in file ", f, call. = FALSE) |
152 |
) |
|
153 |
})) |
|
154 |
} |
|
155 | 9x |
id <- names(text) |
156 |
} |
|
157 |
} else if ( |
|
158 | 36x |
length(text) == 1 && |
159 | 36x |
dirname(text) != "." && |
160 | 36x |
dir.exists(dirname(dirname(text))) |
161 |
) { |
|
162 | ! |
stop("text appears to be a directory, but it does not exist") |
163 |
} |
|
164 | 47x |
if (text_as_paths && is.null(id)) { |
165 | 2x |
id <- text |
166 | 2x |
if (anyDuplicated(id)) |
167 | ! |
id <- names(unlist(lapply( |
168 | ! |
split(id, factor(id, unique(id))), |
169 | ! |
seq_along |
170 |
))) |
|
171 |
} |
|
172 |
} |
|
173 | 64x |
if (is.null(dim(text))) { |
174 | 58x |
if (!read_in) { |
175 | 55x |
if (!text_as_paths && !is.null(text_column)) |
176 | 1x |
stop("text_column is specified, but text has no columns", call. = FALSE) |
177 | 54x |
if (!is.null(id_column)) |
178 | 1x |
stop("id_column is specified, but text has no columns", call. = FALSE) |
179 |
} |
|
180 |
} else { |
|
181 | 1x |
if (length(id) == 1 && id %in% colnames(text)) id_column <- id |
182 | 6x |
if (!is.null(id_column)) { |
183 | 2x |
if (id_column %in% colnames(text)) { |
184 | 1x |
id <- text[, id_column, drop = TRUE] |
185 |
} else { |
|
186 | 1x |
stop("id_column not found in text", call. = FALSE) |
187 |
} |
|
188 |
} |
|
189 | 5x |
if (!is.null(text_column)) { |
190 | 3x |
if (text_column %in% colnames(text)) { |
191 | 2x |
text <- text[, text_column, drop = TRUE] |
192 |
} else { |
|
193 | 1x |
if (!text_as_paths) stop("text_column not found in text", call. = FALSE) |
194 |
} |
|
195 |
} |
|
196 | 4x |
if (!is.null(dim(text))) { |
197 | 2x |
if (ncol(text) == 1) { |
198 | 1x |
text <- text[, 1, drop = TRUE] |
199 |
} else { |
|
200 | 1x |
stop("text has dimensions, but no text_column column", call. = FALSE) |
201 |
} |
|
202 |
} |
|
203 |
} |
|
204 | 2x |
if (!is.character(text)) text <- as.character(text) |
205 | 59x |
if (!length(text)) |
206 | 1x |
stop("no texts were found after resolving the text argument") |
207 | 4x |
if (length(id) && !is.character(id)) id <- as.character(id) |
208 | 58x |
provided_id <- FALSE |
209 | 58x |
if (length(id)) { |
210 | 15x |
if (length(id) != length(text)) |
211 | 1x |
stop("id is not the same length as text", call. = FALSE) |
212 | 1x |
if (anyDuplicated(id)) stop("id contains duplicate values", call. = FALSE) |
213 | 13x |
provided_id <- TRUE |
214 |
} else { |
|
215 | 43x |
id <- paste0("t", seq_along(text)) |
216 |
} |
|
217 | 1x |
if (!is.numeric(retry_limit)) retry_limit <- 0 |
218 | 56x |
if (to_norming) { |
219 | 2x |
version <- "v2" |
220 | 2x |
endpoint <- "norming" |
221 | 2x |
full_url <- url |
222 | 2x |
request_cache <- FALSE |
223 |
} else { |
|
224 | 54x |
url_parts <- unlist(strsplit( |
225 | 54x |
regmatches( |
226 | 54x |
url, |
227 | 54x |
gregexpr("/[Vv]\\d+(?:/[^/]+)?", url) |
228 | 54x |
)[[1]], |
229 |
"/", |
|
230 | 54x |
fixed = TRUE |
231 |
)) |
|
232 | 54x |
if (version == "") |
233 | 49x |
version <- if (length(url_parts) > 1) url_parts[[2]] else "v1" |
234 | 54x |
version <- tolower(version) |
235 | 54x |
if (version == "" || !grepl("^v\\d+$", version)) { |
236 | 1x |
stop("invalid version: ", version, call. = FALSE) |
237 |
} |
|
238 | 53x |
if (endpoint == "") { |
239 | 52x |
endpoint <- if (length(url_parts) > 2) { |
240 | ! |
url_parts[[3]] |
241 |
} else { |
|
242 | 4x |
if (tolower(version) == "v1") "framework" else "analyze" |
243 |
} |
|
244 |
} |
|
245 | 53x |
endpoint <- sub("^.*/", "", tolower(endpoint)) |
246 | 53x |
if (endpoint == "" || grepl("[^a-z]", endpoint)) { |
247 | 1x |
stop("invalid endpoint: ", endpoint, call. = FALSE) |
248 |
} |
|
249 | 52x |
url <- paste0(sub("/+[Vv]\\d+(/.*)?$|/+$", "", url), "/", version, "/") |
250 | 52x |
full_url <- paste0( |
251 | 52x |
url, |
252 | 52x |
endpoint, |
253 | 52x |
if (version == "v1") "/bulk" else paste0("/", context) |
254 |
) |
|
255 | ! |
if (!is.list(api_args)) api_args <- as.list(api_args) |
256 |
if ( |
|
257 | 52x |
version != "v1" && |
258 | 52x |
"context" %in% api_args && |
259 | 52x |
"custom_context" %in% api_args |
260 |
) { |
|
261 | ! |
stop( |
262 | ! |
"only one of `context` or `custom_context may be specified", |
263 | ! |
call. = FALSE |
264 |
) |
|
265 |
} |
|
266 | 52x |
if (version != "v1" && length(api_args)) { |
267 | ! |
full_url <- paste0( |
268 | ! |
full_url, |
269 |
"?", |
|
270 | ! |
paste0(names(api_args), "=", unlist(api_args), collapse = "&") |
271 |
) |
|
272 |
} |
|
273 |
} |
|
274 | 54x |
args_hash <- digest::digest( |
275 | 54x |
jsonlite::toJSON( |
276 | 54x |
c( |
277 | 54x |
api_args, |
278 | 54x |
url = full_url, |
279 | 54x |
key = key, |
280 | 54x |
secret = secret |
281 |
), |
|
282 | 54x |
auto_unbox = TRUE |
283 |
), |
|
284 | 54x |
serialize = FALSE |
285 |
) |
|
286 | ||
287 |
# ping API |
|
288 | 54x |
if (make_request) { |
289 | 4x |
if (verbose) message("pinging API (", round(proc.time()[[3]] - st, 4), ")") |
290 | 51x |
ping <- receptiviti_status(url, key, secret, verbose = FALSE) |
291 | 1x |
if (is.null(ping)) stop("URL is unreachable", call. = FALSE) |
292 | 1x |
if (ping$status_code != 200) stop(ping$status_message, call. = FALSE) |
293 |
} |
|
294 | ||
295 |
# prepare text |
|
296 | 4x |
if (verbose) message("preparing text (", round(proc.time()[[3]] - st, 4), ")") |
297 | 50x |
data <- data.frame(text = text, id = id, stringsAsFactors = FALSE) |
298 | 50x |
text <- data[!is.na(data$text) & data$text != "" & !duplicated(data$text), ] |
299 | 1x |
if (!nrow(text)) stop("no valid texts to process", call. = FALSE) |
300 | 1x |
if (!is.numeric(bundle_size)) bundle_size <- 1000 |
301 | 49x |
n_texts <- nrow(text) |
302 | 49x |
n <- ceiling(n_texts / min(1000, max(1, bundle_size))) |
303 | 49x |
bundles <- split(text, sort(rep_len(seq_len(n), nrow(text)))) |
304 | 49x |
size_fun <- if (text_as_paths) function(b) sum(file.size(b$text)) else |
305 | 49x |
object.size |
306 | 49x |
for (i in rev(seq_along(bundles))) { |
307 | 160x |
size <- size_fun(bundles[[i]]) |
308 | 160x |
if (size > bundle_byte_limit) { |
309 | 1x |
sizes <- vapply( |
310 | 1x |
seq_len(nrow(bundles[[i]])), |
311 | 1x |
function(r) as.numeric(size_fun(bundles[[i]][r, ])), |
312 | 1x |
0 |
313 |
) |
|
314 | 1x |
if (any(sizes > bundle_byte_limit)) { |
315 | 1x |
stop( |
316 | 1x |
"one of your texts is over the individual size limit (", |
317 | 1x |
bundle_byte_limit / 1024e3, |
318 | 1x |
" MB)", |
319 | 1x |
call. = FALSE |
320 |
) |
|
321 |
} |
|
322 | ! |
bins <- rep(1, length(sizes)) |
323 | ! |
bin_size <- 0 |
324 | ! |
bi <- 1 |
325 | ! |
for (ti in seq_along(bins)) { |
326 | ! |
bin_size <- bin_size + sizes[ti] |
327 | ! |
if (bin_size > bundle_byte_limit) { |
328 | ! |
bin_size <- sizes[ti] |
329 | ! |
bi <- bi + 1 |
330 |
} |
|
331 | ! |
bins[ti] <- bi |
332 |
} |
|
333 | ! |
bundles <- c( |
334 | ! |
bundles[-i], |
335 | ! |
unname(split(bundles[[i]], paste0(i, ".", bins))) |
336 |
) |
|
337 |
} |
|
338 |
} |
|
339 | 48x |
n_bundles <- length(bundles) |
340 | 48x |
bundle_ref <- if (n_bundles == 1) "bundle" else "bundles" |
341 | 48x |
if (verbose) |
342 | 4x |
message( |
343 | 4x |
"prepared text in ", |
344 | 4x |
n_bundles, |
345 |
" ", |
|
346 | 4x |
bundle_ref, |
347 |
" (", |
|
348 | 4x |
round(proc.time()[[3]] - st, 4), |
349 |
")" |
|
350 |
) |
|
351 | ||
352 | 48x |
auth <- paste0(key, ":", secret) |
353 | 48x |
if (is.null(in_memory) && (use_future || cores > 1) && n_bundles > cores) |
354 | ! |
in_memory <- FALSE |
355 | 48x |
request_scratch <- NULL |
356 | 48x |
if (!in_memory) { |
357 | 2x |
if (verbose) |
358 | ! |
message( |
359 | ! |
"writing ", |
360 | ! |
bundle_ref, |
361 | ! |
" to disc (", |
362 | ! |
round(proc.time()[[3]] - st, 4), |
363 |
")" |
|
364 |
) |
|
365 | 2x |
request_scratch <- paste0(tempdir(), "/receptiviti_request_scratch/") |
366 | 2x |
dir.create(request_scratch, FALSE) |
367 | 2x |
if (clear_scratch_cache) on.exit(unlink(request_scratch, recursive = TRUE)) |
368 | 2x |
bundles <- vapply( |
369 | 2x |
bundles, |
370 | 2x |
function(b) { |
371 | 6x |
scratch_bundle <- paste0(request_scratch, digest::digest(b), ".rds") |
372 | 6x |
if (!file.exists(scratch_bundle)) |
373 | 6x |
saveRDS(b, scratch_bundle, compress = FALSE) |
374 | 6x |
scratch_bundle |
375 |
}, |
|
376 |
"", |
|
377 | 2x |
USE.NAMES = FALSE |
378 |
) |
|
379 |
} |
|
380 | ||
381 | 48x |
doprocess <- function(bundles, cores, future) { |
382 | 7x |
env <- parent.frame() |
383 | 7x |
if (future) { |
384 | 1x |
eval( |
385 | 1x |
expression(future.apply::future_lapply(bundles, process)), |
386 | 1x |
envir = env |
387 |
) |
|
388 |
} else { |
|
389 | 6x |
cl <- parallel::makeCluster(cores) |
390 | 6x |
parallel::clusterExport(cl, ls(envir = env), env) |
391 | 6x |
on.exit(parallel::stopCluster(cl)) |
392 | 6x |
(if (length(bundles) > cores * 2) parallel::parLapplyLB else |
393 | 6x |
parallel::parLapply)(cl, bundles, process) |
394 |
} |
|
395 |
} |
|
396 | ||
397 | 48x |
request <- function(body, body_hash, bin, ids, attempt = retry_limit) { |
398 | 636x |
temp_file <- paste0(tempdir(), "/", body_hash, ".json") |
399 | 618x |
if (!request_cache) unlink(temp_file) |
400 | 636x |
res <- NULL |
401 | 636x |
if (!file.exists(temp_file)) { |
402 | 630x |
if (make_request) { |
403 | 629x |
handler <- tryCatch( |
404 | 629x |
curl::new_handle(httpauth = 1, userpwd = auth, copypostfields = body), |
405 | 629x |
error = function(e) e$message |
406 |
) |
|
407 | 629x |
if (is.character(handler)) { |
408 | ! |
stop( |
409 | ! |
if (grepl("libcurl", handler, fixed = TRUE)) { |
410 | ! |
"libcurl encountered an error; try setting the bundle_byte_limit argument to a smaller value" |
411 |
} else { |
|
412 | ! |
paste("failed to create handler:", handler) |
413 |
}, |
|
414 | ! |
call. = FALSE |
415 |
) |
|
416 |
} |
|
417 | 2x |
if (to_norming) curl::handle_setopt(handler, customrequest = "PATCH") |
418 | 629x |
res <- curl::curl_fetch_disk(full_url, temp_file, handler) |
419 |
} else { |
|
420 | 1x |
stop( |
421 | 1x |
"make_request is FALSE, but there are texts with no cached results", |
422 | 1x |
call. = FALSE |
423 |
) |
|
424 |
} |
|
425 |
} |
|
426 | 635x |
result <- if (file.exists(temp_file)) { |
427 | 635x |
if ( |
428 | 635x |
is.null(res$type) || grepl("application/json", res$type, fixed = TRUE) |
429 |
) { |
|
430 | 635x |
tryCatch( |
431 | 635x |
jsonlite::read_json(temp_file, simplifyVector = TRUE), |
432 | 635x |
error = function(e) list(message = "invalid response format") |
433 |
) |
|
434 |
} else { |
|
435 | ! |
list(message = "invalid response format") |
436 |
} |
|
437 |
} else { |
|
438 | ! |
list(message = rawToChar(res$content)) |
439 |
} |
|
440 | 635x |
valid_result <- if (to_norming) { |
441 | 2x |
!is.null(result$submitted) |
442 |
} else { |
|
443 | 633x |
!is.null(result$results) || is.null(result$message) |
444 |
} |
|
445 | 635x |
if (valid_result) { |
446 | 81x |
if (!is.null(result$results)) result <- result$results |
447 | 83x |
if ("error" %in% names(result)) { |
448 | ! |
if (!is.list(result$error)) { |
449 | ! |
warning("bundle ", body_hash, " failed: ", result$error) |
450 | ! |
} else if (is.list(result$error)) { |
451 | ! |
warning( |
452 | ! |
"bundle ", |
453 | ! |
body_hash, |
454 | ! |
" failed: ", |
455 | ! |
if (!is.null(result$error$code)) |
456 | ! |
paste0("(", result$error$code, ") ") else NULL, |
457 | ! |
result$error$message |
458 |
) |
|
459 |
} else { |
|
460 | ! |
su <- !is.na(result$error$code) |
461 | ! |
errors <- if (is.data.frame(result)) { |
462 | ! |
result[su & !duplicated(result$error$code), "error"] |
463 |
} else { |
|
464 | ! |
result$error |
465 |
} |
|
466 | ! |
warning( |
467 | ! |
if (sum(su) > 1) "some texts were invalid: " else |
468 | ! |
"a text was invalid: ", |
469 | ! |
paste( |
470 | ! |
do.call( |
471 | ! |
paste0, |
472 | ! |
data.frame( |
473 |
"(", |
|
474 | ! |
errors$code, |
475 |
") ", |
|
476 | ! |
errors$message, |
477 | ! |
stringsAsFactors = FALSE |
478 |
) |
|
479 |
), |
|
480 | ! |
collapse = "; " |
481 |
), |
|
482 | ! |
call. = FALSE |
483 |
) |
|
484 |
} |
|
485 |
} |
|
486 | 83x |
if (to_norming) { |
487 | 2x |
cbind(body_hash = body_hash, as.data.frame(result)) |
488 |
} else { |
|
489 | 81x |
unpack <- function(d) { |
490 | 814x |
if (is.list(d)) as.data.frame(lapply(d, unpack), optional = TRUE) else |
491 | 16204x |
d |
492 |
} |
|
493 | 81x |
result <- unpack(result[ |
494 | 81x |
!names(result) %in% c("response_id", "language", "version", "error") |
495 |
]) |
|
496 | 81x |
if (!is.null(result) && nrow(result)) { |
497 | 81x |
if (colnames(result)[[1]] == "request_id") { |
498 | 81x |
colnames(result)[[1]] <- "text_hash" |
499 |
} |
|
500 | 81x |
cbind(id = ids, bin = bin, result) |
501 |
} |
|
502 |
} |
|
503 |
} else { |
|
504 | 552x |
unlink(temp_file) |
505 | 552x |
if (length(result$message) == 1 && substr(result$message, 1, 1) == "{") { |
506 | ! |
result <- jsonlite::fromJSON(result$message) |
507 |
} |
|
508 | 48x |
if ( |
509 | 552x |
attempt > 0 && |
510 | 552x |
(length(result$code) == 1 && result$code == 1420) || |
511 | 552x |
(length(result$message) == 1 && |
512 | 552x |
result$message == "invalid response format") |
513 |
) { |
|
514 | 526x |
wait_time <- as.numeric(regmatches( |
515 | 526x |
result$message, |
516 | 526x |
regexec("[0-9]+(?:\\.[0-9]+)?", result$message) |
517 |
)) |
|
518 | 526x |
Sys.sleep(if (is.na(wait_time)) 1 else wait_time / 1e3) |
519 | 526x |
request(body, body_hash, bin, ids, attempt - 1) |
520 |
} else { |
|
521 | 26x |
message <- if (is.null(res$status_code)) 200 else res$status_code |
522 | 26x |
if (length(result$code)) |
523 | 26x |
message <- paste0(message, " (", result$code, "): ", result$message) |
524 | ! |
if (length(result$error)) message <- paste0(message, ": ", result$error) |
525 | 26x |
stop(message, call. = FALSE) |
526 |
} |
|
527 |
} |
|
528 |
} |
|
529 | ||
530 | 48x |
# Score one bundle of texts: read files if needed, hash texts, check the
# on-disk cache, request scores for uncached texts, write fresh scores back
# to the cache, and return the combined results (or NULL).
#
# `bundle` is either a data.frame-like object with $text/$id, or a path to
# an RDS file containing one (used when bundles are spooled to disk).
#
# NOTE(review): this closure reads many variables from the enclosing
# manage_request environment (text_as_paths, text_column, cache,
# cache_format, cache_overwrite, api_args, args_hash, version, to_norming,
# collect_results, handle_encoding, request, prog, initial, ...) — they are
# explicitly copied into a call environment for parallel workers elsewhere
# in the file.
process <- function(bundle) {
  # Temporarily force stringsAsFactors = FALSE; restored on exit.
  opts <- getOption("stringsAsFactors")
  options("stringsAsFactors" = FALSE)
  on.exit(options("stringsAsFactors" = opts))
  # A character bundle is a path to a spooled RDS bundle (in_memory = FALSE).
  if (is.character(bundle)) bundle <- readRDS(bundle)
  text <- bundle$text
  # Placeholder so `bin` in dplyr::filter below resolves without a
  # global-variable NOTE — presumably for R CMD check; verify.
  bin <- NULL
  if (text_as_paths) {
    # Texts are file paths: read each file's content, collapsing to one text.
    if (all(grepl("\\.csv", text, TRUE))) {
      # CSV files: need a text_column to know which column holds the text.
      if (is.null(text_column))
        stop(
          "files appear to be csv, but no text_column was specified",
          call. = FALSE
        )
      text <- vapply(
        text,
        function(f) {
          tryCatch(
            paste(
              arrow::read_csv_arrow(
                f,
                read_options = arrow::CsvReadOptions$create(
                  encoding = handle_encoding(f)
                ),
                col_select = dplyr::all_of(text_column)
              )[[1]],
              collapse = " "
            ),
            error = function(e)
              stop("failed to read in file ", f, call. = FALSE)
          )
        },
        ""
      )
    } else {
      # Plain-text files: read all lines with detected encoding.
      text <- vapply(
        text,
        function(f) {
          tryCatch(
            {
              con <- file(f, encoding = handle_encoding(f))
              on.exit(close(con))
              paste(
                readLines(con, warn = FALSE, skipNul = TRUE),
                collapse = " "
              )
            },
            error = function(e)
              stop("failed to read in file ", f, call. = FALSE)
          )
        },
        ""
      )
    }
  }
  # Hash each text (prefixed with the hash of the API arguments) so cached
  # scores are keyed by both text content and request options.
  bundle$hashes <- paste0(vapply(
    paste0(args_hash, text),
    digest::digest,
    "",
    serialize = FALSE
  ))
  if (to_norming) {
    # Norming-context mode: send every text, no cache interaction.
    body <- jsonlite::toJSON(
      lapply(
        seq_along(text),
        function(i) list(text = text[[i]], request_id = bundle$hashes[[i]])
      ),
      auto_unbox = TRUE
    )
    res <- request(
      body,
      digest::digest(body, serialize = FALSE),
      initial,
      bundle$id
    )
    prog(amount = nrow(bundle))
  } else {
    # Normal scoring mode. `initial` is the cache partition key (first hash
    # character, "h"-prefixed).
    initial <- paste0("h", substr(bundle$hashes, 1, 1))
    # Texts to actually send: non-missing, non-empty, not a stringified
    # logical(0), and not a duplicate within the bundle.
    set <- !is.na(text) &
      text != "" &
      text != "logical(0)" &
      !duplicated(bundle$hashes)
    res_cached <- cached_cols <- res_fresh <- NULL
    nres <- ncached <- 0
    check_cache <- !cache_overwrite &&
      (cache != "" && length(list.dirs(cache)))
    if (check_cache) {
      # Open the bin-partitioned cache dataset and pull any rows matching
      # this bundle's hashes.
      db <- arrow::open_dataset(
        cache,
        partitioning = arrow::schema(bin = arrow::string()),
        format = cache_format
      )
      cached_cols <- colnames(db)
      cached <- if (!is.null(db$schema$GetFieldByName("text_hash"))) {
        # Placeholder for the dataset column, as with `bin` above.
        text_hash <- NULL
        su <- dplyr::filter(
          db,
          bin %in% unique(initial),
          text_hash %in% bundle$hashes
        )
        # If results are not being collected, only the hashes are needed to
        # mark texts as already scored. Fall back to an empty result on
        # any arrow/dplyr error.
        tryCatch(
          dplyr::compute(
            if (collect_results) su else dplyr::select(su, text_hash)
          ),
          error = function(e) matrix(integer(), 0)
        )
      } else {
        # Cache exists but has no text_hash column: treat as empty.
        matrix(integer(), 0)
      }
      ncached <- nrow(cached)
      if (ncached) {
        cached <- as.data.frame(cached$to_data_frame())
        if (anyDuplicated(cached$text_hash))
          cached <- cached[!duplicated(cached$text_hash), ]
        rownames(cached) <- cached$text_hash
        # Cached texts are removed from the to-send set.
        cached_set <- which(bundle$hashes %in% cached$text_hash)
        set[cached_set] <- FALSE
        if (collect_results) {
          res_cached <- cbind(
            id = bundle$id[cached_set],
            cached[bundle$hashes[cached_set], ]
          )
        }
      }
    }
    valid_options <- names(api_args)
    if (any(set)) {
      set <- which(set)
      # v1 embeds api_args per text and uses `content`; v2 uses `text`.
      make_bundle <- if (version == "v1") {
        function(i) {
          c(
            api_args,
            list(content = text[[i]], request_id = bundle$hashes[[i]])
          )
        }
      } else {
        function(i) {
          list(text = text[[i]], request_id = bundle$hashes[[i]])
        }
      }
      body <- jsonlite::toJSON(
        unname(lapply(set, make_bundle)),
        auto_unbox = TRUE
      )
      body_hash <- digest::digest(body, serialize = FALSE)
      res_fresh <- request(body, body_hash, initial[set], bundle$id[set])
      # Drop columns that merely echo request options back; they are
      # re-attached uniformly below when collecting results.
      valid_options <- valid_options[valid_options %in% colnames(res_fresh)]
      if (length(valid_options)) {
        res_fresh <- res_fresh[,
          !colnames(res_fresh) %in% valid_options,
          drop = FALSE
        ]
      }
      # If cached rows have columns the fresh response lacks, the schemas
      # are incompatible (e.g., account frameworks changed): discard the
      # cached rows and re-request those texts so all rows align.
      if (ncached && !all(cached_cols %in% colnames(res_fresh))) {
        res_cached <- NULL
        ncached <- 0
        body <- jsonlite::toJSON(
          lapply(cached_set, make_bundle),
          auto_unbox = TRUE
        )
        res_fresh <- rbind(
          res_fresh,
          request(
            body,
            digest::digest(body, serialize = FALSE),
            initial[cached_set],
            bundle$id[cached_set]
          )
        )
      }
      nres <- nrow(res_fresh)
      if (cache != "" && nres) {
        # Write fresh scores to the cache, one fragment file per bin.
        writer <- if (cache_format == "parquet") arrow::write_parquet else
          arrow::write_feather
        # Identify character columns to build an explicit arrow schema;
        # everything else is numeric except the two count columns.
        cols <- vapply(
          res_fresh[,
            !(colnames(res_fresh) %in% c("id", "bin", names(api_args)))
          ],
          is.character,
          TRUE
        )
        schema <- list()
        for (v in names(cols)) {
          schema[[v]] <- if (cols[[v]]) {
            arrow::string()
          } else if (
            v %in% c("summary.word_count", "summary.sentence_count")
          ) {
            # Coerce NAs to typed NAs so arrow accepts the column type.
            if (anyNA(res_fresh[[v]]))
              res_fresh[[v]][is.na(res_fresh[[v]])] <- NA_integer_
            arrow::int32()
          } else {
            if (anyNA(res_fresh[[v]]))
              res_fresh[[v]][is.na(res_fresh[[v]])] <- NA_real_
            arrow::float64()
          }
        }
        schema <- arrow::schema(schema)
        for (part_bin in unique(res_fresh$bin)) {
          part <- res_fresh[res_fresh$bin == part_bin, ]
          # id/bin are implicit in the partition layout, not stored.
          part$id <- NULL
          part$bin <- NULL
          bin_dir <- paste0(cache, "/bin=", part_bin, "/")
          dir.create(bin_dir, FALSE, TRUE)
          writer(
            arrow::as_arrow_table(part, schema = schema),
            paste0(bin_dir, "fragment-", body_hash, "-0.", cache_format)
          )
        }
      }
    }
    if (collect_results) {
      # Combine cached and fresh rows, and re-attach the option columns
      # that were stripped above.
      res <- rbind(res_cached, res_fresh)
      if (length(valid_options))
        for (n in valid_options) res[[n]] <- api_args[[n]]
      # Pad all-NA rows for texts that produced no result (empty/invalid),
      # so output rows align one-to-one with input ids.
      missing_ids <- !bundle$id %in% res$id
      if (any(missing_ids)) {
        varnames <- colnames(res)[colnames(res) != "id"]
        res <- rbind(
          res,
          cbind(
            id = bundle$id[missing_ids],
            as.data.frame(matrix(
              NA,
              sum(missing_ids),
              length(varnames),
              dimnames = list(NULL, varnames)
            ))
          )
        )
        res$text_hash <- structure(bundle$hashes, names = bundle$id)[res$id]
      }
    }
    prog(amount = nres + ncached)
  }
  # NOTE(review): `res` is only assigned on the branches above; presumably
  # every reachable path with collect_results = TRUE assigns it — verify.
  if (collect_results) res else NULL
}
|
767 | ||
768 |
# make request(s) |
|
769 | 48x |
cores <- if (is.numeric(cores)) max(1, min(n_bundles, cores)) else 1 |
770 | 48x |
prog <- progressor(n_texts) |
771 | 48x |
results <- if (use_future || cores > 1) { |
772 | 7x |
call_env <- new.env(parent = globalenv()) |
773 | 7x |
environment(doprocess) <- call_env |
774 | 7x |
environment(request) <- call_env |
775 | 7x |
environment(process) <- call_env |
776 | 7x |
for (name in c( |
777 | 7x |
"doprocess", |
778 | 7x |
"request", |
779 | 7x |
"process", |
780 | 7x |
"text_column", |
781 | 7x |
"prog", |
782 | 7x |
"make_request", |
783 | 7x |
"full_url", |
784 | 7x |
"cache", |
785 | 7x |
"cache_overwrite", |
786 | 7x |
"use_future", |
787 | 7x |
"cores", |
788 | 7x |
"bundles", |
789 | 7x |
"cache_format", |
790 | 7x |
"request_cache", |
791 | 7x |
"auth", |
792 | 7x |
"version", |
793 | 7x |
"to_norming", |
794 | 7x |
"text_as_paths", |
795 | 7x |
"retry_limit", |
796 | 7x |
"api_args", |
797 | 7x |
"args_hash", |
798 | 7x |
"encoding", |
799 | 7x |
"handle_encoding", |
800 | 7x |
"collect_results" |
801 |
)) { |
|
802 | 168x |
call_env[[name]] <- get(name) |
803 |
} |
|
804 | 7x |
if (verbose) { |
805 | ! |
message( |
806 | ! |
"processing ", |
807 | ! |
bundle_ref, |
808 | ! |
" using ", |
809 | ! |
if (use_future) "future backend" else paste(cores, "cores"), |
810 |
" (", |
|
811 | ! |
round(proc.time()[[3]] - st, 4), |
812 |
")" |
|
813 |
) |
|
814 |
} |
|
815 | 7x |
eval(expression(doprocess(bundles, cores, use_future)), envir = call_env) |
816 |
} else { |
|
817 | 41x |
if (verbose) |
818 | 4x |
message( |
819 | 4x |
"processing ", |
820 | 4x |
bundle_ref, |
821 | 4x |
" sequentially (", |
822 | 4x |
round(proc.time()[[3]] - st, 4), |
823 |
")" |
|
824 |
) |
|
825 | 41x |
lapply(bundles, process) |
826 |
} |
|
827 | 46x |
if (verbose) |
828 | 4x |
message("done retrieving (", round(proc.time()[[3]] - st, 4), ")") |
829 | 46x |
if (collect_results) { |
830 | 46x |
final_res <- do.call(rbind, results) |
831 | 46x |
list(data = data, final_res = final_res, provided_id = provided_id) |
832 |
} else { |
|
833 | ! |
NULL |
834 |
} |
|
835 |
} |
1 |
# Package load hook: make sure the RECEPTIVITI_URL environment variable has
# a value, defaulting it to the public API endpoint when it is unset/empty.
# (Other credentials, like RECEPTIVITI_KEY, have no sensible default and are
# left alone.)
.onLoad <- function(lib, pkg) {
  configured_url <- Sys.getenv("RECEPTIVITI_URL")
  if (!nzchar(configured_url)) {
    Sys.setenv(RECEPTIVITI_URL = "https://api.receptiviti.com/")
  }
}
|
5 | ||
6 |
#' Receptiviti API |
|
7 |
#' |
|
8 |
#' The main function to access the \href{https://www.receptiviti.com}{Receptiviti} API. |
|
9 |
#' |
|
10 |
#' @param text A character vector with text to be processed, path to a directory containing files, or a vector of file paths. |
|
11 |
#' If a single path to a directory, each file is collapsed to a single text. If a path to a file or files, |
|
12 |
#' each line or row is treated as a separate text, unless \code{collapse_lines} is \code{TRUE} (in which case, |
|
13 |
#' files will be read in as part of bundles at processing time, as is always the case when a directory). |
|
14 |
#' Use \code{files} to more reliably enter files, or \code{dir} to more reliably specify a directory. |
|
15 |
#' @param output Path to a \code{.csv} file to write results to. If this already exists, set \code{overwrite} to \code{TRUE} |
|
16 |
#' to overwrite it. |
|
17 |
#' @param id Vector of unique IDs the same length as \code{text}, to be included in the results. |
|
18 |
#' @param text_column,id_column Column name of text/id, if \code{text} is a matrix-like object, or a path to a csv file. |
|
19 |
#' @param files A list of file paths, as alternate entry to \code{text}. |
|
20 |
#' @param dir A directory to search for files in, as alternate entry to \code{text}. |
|
21 |
#' @param file_type File extension to search for, if \code{text} is the path to a directory containing files to be read in. |
|
22 |
#' @param encoding Encoding of file(s) to be read in. If not specified, this will be detected, which can fail, |
|
23 |
#' resulting in mis-encoded characters; for best (and fastest) results, specify encoding. |
|
24 |
#' @param return_text Logical; if \code{TRUE}, \code{text} is included as the first column of the result. |
|
25 |
#' @param context Name of the analysis context. |
|
26 |
#' @param custom_context Name of a custom context (as listed by \code{\link{receptiviti_norming}}), |
|
27 |
#' or \code{TRUE} if \code{context} is the name of a custom context. |
|
28 |
#' @param api_args A list of additional arguments to pass to the API (e.g., \code{list(sallee_mode = "sparse")}). Defaults to the |
|
29 |
#' \code{receptiviti.api_args} option. Custom norming contexts can be established with the \code{\link{receptiviti_norming}} |
|
30 |
#' function, then referred to here with the \code{custom_context} argument (only available in API V2). |
|
31 |
#' @param frameworks A vector of frameworks to include results from. Texts are always scored with all available frameworks -- |
|
32 |
#' this just specifies what to return. Defaults to \code{all}, to return all scored frameworks. Can be set by the |
|
33 |
#' \code{receptiviti.frameworks} option (e.g., \code{options(receptiviti.frameworks = c("liwc", "sallee"))}). |
|
34 |
#' @param framework_prefix Logical; if \code{FALSE}, will remove the framework prefix from column names, which may result in duplicates. |
|
35 |
#' If this is not specified, and 1 framework is selected, or \code{as_list} is \code{TRUE}, will default to remove prefixes. |
|
36 |
#' @param as_list Logical; if \code{TRUE}, returns a list with frameworks in separate entries. |
|
37 |
#' @param bundle_size Number of texts to include in each request; between 1 and 1,000. |
|
38 |
#' @param bundle_byte_limit Memory limit (in bytes) of each bundle, under \code{1e7} (10 MB, which is the API's limit). |
|
39 |
#' May need to be lower than the API's limit, depending on the system's requesting library. |
|
40 |
#' @param collapse_lines Logical; if \code{TRUE}, and \code{text} contains paths to files, each file is treated as a single text. |
|
41 |
#' @param retry_limit Maximum number of times each request can be retried after hitting a rate limit. |
|
42 |
#' @param overwrite Logical; if \code{TRUE}, will overwrite an existing \code{output} file. |
|
43 |
#' @param compress Logical; if \code{TRUE}, will save as an \code{xz}-compressed file. |
|
44 |
#' @param make_request Logical; if \code{FALSE}, a request is not made. This could be useful if you want to be sure and |
|
45 |
#' load from one of the caches, but aren't sure that all results exist there; it will error out if it encounters |
|
46 |
#' texts it has no other source for. |
|
47 |
#' @param text_as_paths Logical; if \code{TRUE}, ensures \code{text} is treated as a vector of file paths. Otherwise, this will be |
|
48 |
#' determined if there are no \code{NA}s in \code{text} and every entry is under 500 characters long. |
|
49 |
#' @param cache Path to a directory in which to save unique results for reuse; defaults to |
|
50 |
#' \code{Sys.getenv(}\code{"RECEPTIVITI_CACHE")}. See the Cache section for details. |
|
51 |
#' @param cache_overwrite Logical; if \code{TRUE}, will write results to the cache without reading from it. This could be used |
|
52 |
#' if you want fresh results to be cached without clearing the cache. |
|
53 |
#' @param cache_format Format of the cache database; see \code{\link[arrow]{FileFormat}}. |
|
54 |
#' Defaults to \code{Sys.getenv(}\code{"RECEPTIVITI_CACHE_FORMAT")}. |
|
55 |
#' @param clear_cache Logical; if \code{TRUE}, will clear any existing files in the cache. Use \code{cache_overwrite} if |
|
56 |
#' you want fresh results without clearing or disabling the cache. Use \code{cache = FALSE} to disable the cache. |
|
57 |
#' @param request_cache Logical; if \code{FALSE}, will always make a fresh request, rather than using the response |
|
58 |
#' from a previous identical request. |
|
59 |
#' @param cores Number of CPU cores to split bundles across, if there are multiple bundles. See the Parallelization section. |
|
60 |
#' @param collect_results Logical; if \code{FALSE}, will not retain bundle results in memory for return. |
|
61 |
#' @param use_future Logical; if \code{TRUE}, uses a \code{future} back-end to process bundles, in which case, |
|
62 |
#' parallelization can be controlled with the \code{\link[future]{plan}} function (e.g., \code{plan("multisession")} |
|
63 |
#' to use multiple cores); this is required to see progress bars when using multiple cores. See the Parallelization section. |
|
64 |
#' @param in_memory Logical; if \code{FALSE}, will write bundles to temporary files, and only load them as they are being requested. |
|
65 |
#' @param clear_scratch_cache Logical; if \code{FALSE}, will preserve the bundles written when \code{in_memory} is \code{TRUE}, after |
|
66 |
#' the request has been made. |
|
67 |
#' @param verbose Logical; if \code{TRUE}, will show status messages. |
|
68 |
#' @param key API Key; defaults to \code{Sys.getenv("RECEPTIVITI_KEY")}. |
|
69 |
#' @param secret API Secret; defaults to \code{Sys.getenv("RECEPTIVITI_SECRET")}. |
|
70 |
#' @param url API URL; defaults to \code{Sys.getenv("RECEPTIVITI_URL")}, which defaults to |
|
71 |
#' \code{"https://api.receptiviti.com/"}. |
|
72 |
#' @param version API version; defaults to \code{Sys.getenv("RECEPTIVITI_VERSION")}, which defaults to |
|
73 |
#' \code{"v1"}. |
|
74 |
#' @param endpoint API endpoint (path name after the version); defaults to \code{Sys.getenv("RECEPTIVITI_ENDPOINT")}, |
|
75 |
#' which defaults to \code{"framework"}. |
|
76 |
#' @param include_headers Logical; if \code{TRUE}, \code{receptiviti_status}'s verbose message will include |
|
77 |
#' the HTTP headers. |
|
78 |
#' |
|
79 |
#' @returns Nothing if \code{collect_results} is \code{FALSE}. |
|
80 |
#' Otherwise, a \code{data.frame} with columns for \code{text} (if \code{return_text} is \code{TRUE}; the originally entered text), |
|
81 |
#' \code{id} (if one was provided), \code{text_hash} (the MD5 hash of the text), a column each for relevant entries in \code{api_args}, |
|
82 |
#' and scores from each included framework (e.g., \code{summary.word_count} and \code{liwc.i}). If \code{as_list} is \code{TRUE}, |
|
83 |
#' returns a list with a named entry containing such a \code{data.frame} for each framework. |
|
84 |
#' |
|
85 |
#' @section Request Process: |
|
86 |
#' This function (along with the internal \code{manage_request} function) handles texts and results in several steps: |
|
87 |
#' \enumerate{ |
|
88 |
#' \item Prepare bundles (split \code{text} into <= \code{bundle_size} and <= \code{bundle_byte_limit} bundles). |
|
89 |
#' \enumerate{ |
|
90 |
#' \item If \code{text} points to a directory or list of files, these will be read in later. |
|
91 |
#' \item If \code{in_memory} is \code{FALSE}, bundles are written to a temporary location, |
|
92 |
#' and read back in when the request is made. |
|
93 |
#' } |
|
94 |
#' \item Get scores for texts within each bundle. |
|
95 |
#' \enumerate{ |
|
96 |
#' \item If texts are paths, or \code{in_memory} is \code{FALSE}, will load texts. |
|
97 |
#' \item If \code{cache} is set, will skip any texts with cached scores. |
|
98 |
#' \item If \code{request_cache} is \code{TRUE}, will check for a cached request. |
|
99 |
#' \item If any texts need scoring and \code{make_request} is \code{TRUE}, will send unscored texts to the API. |
|
100 |
#' } |
|
101 |
#' \item If a request was made and \code{request_cache} is set, will cache the response. |
|
102 |
#' \item If \code{cache} is set, will write bundle scores to the cache. |
|
103 |
#' \item After requests are made, if \code{cache} is set, will defragment the cache |
|
104 |
#' (combine bundle results within partitions). |
|
105 |
#' \item If \code{collect_results} is \code{TRUE}, will prepare results: |
|
106 |
#' \enumerate{ |
|
107 |
#' \item Will realign results with \code{text} (and \code{id} if provided). |
|
108 |
#' \item If \code{output} is specified, will write realigned results to it. |
|
109 |
#' \item Will drop additional columns (such as \code{custom} and \code{id} if not provided). |
|
110 |
#' \item If \code{frameworks} is specified, will use it to select columns of the results. |
|
111 |
#' \item Returns results. |
|
112 |
#' } |
|
113 |
#' } |
|
114 |
#' |
|
115 |
#' @section Cache: |
|
116 |
#' If the \code{cache} argument is specified, results for unique texts are saved in an |
|
117 |
#' \href{https://arrow.apache.org}{Arrow} database in the cache location |
|
118 |
#' (\code{Sys.getenv(}\code{"RECEPTIVITI_CACHE")}), and are retrieved with subsequent requests. |
|
119 |
#' This ensures that the exact same texts are not re-sent to the API. |
|
120 |
#' This does, however, add some processing time and disc space usage. |
|
121 |
#' |
|
122 |
#' If \code{cache} is \code{TRUE}, a default directory (\code{receptiviti_cache}) will be looked for |
|
123 |
#' in the system's temporary directory (which is usually the parent of \code{tempdir()}). |
|
124 |
#' If this does not exist, you will be asked if it should be created. |
|
125 |
#' |
|
126 |
#' The primary cache is checked when each bundle is processed, and existing results are loaded at |
|
127 |
#' that time. When processing many bundles in parallel, and many results have been cached, |
|
128 |
#' this can cause the system to freeze and potentially crash. |
|
129 |
#' To avoid this, limit the number of cores, or disable parallel processing. |
|
130 |
#' |
|
131 |
#' The \code{cache_format} arguments (or the \code{RECEPTIVITI_CACHE_FORMAT} environment variable) can be used to adjust the format of the cache. |
|
132 |
#' |
|
133 |
#' You can use the cache independently with \code{open_database(Sys.getenv("RECEPTIVITI_CACHE"))}. |
|
134 |
#' |
|
135 |
#' You can also set the \code{clear_cache} argument to \code{TRUE} to clear the cache before it is used again, which may be useful |
|
136 |
#' if the cache has gotten big, or you know new results will be returned. Even if a cached result exists, it will be |
|
137 |
#' reprocessed if it does not have all of the variables of new results, but this depends on there being at least 1 uncached |
|
138 |
#' result. If, for instance, you add a framework to your account and want to reprocess a previously processed set of texts, |
|
139 |
#' you would need to first clear the cache. |
|
140 |
#' |
|
141 |
#' Either way, duplicated texts within the same call will only be sent once. |
|
142 |
#' |
|
143 |
#' The \code{request_cache} argument controls a more temporary cache of each bundle request. This is cleared when the |
|
144 |
#' R session ends. You might want to set this to \code{FALSE} if a new framework becomes available on your account |
|
145 |
#' and you want to process a set of text you already processed in the current R session without restarting. |
|
146 |
#' |
|
147 |
#' Another temporary cache is made when \code{in_memory} is \code{FALSE}, which is the default when processing |
|
148 |
#' in parallel (when \code{cores} is over \code{1} or \code{use_future} is \code{TRUE}). This contains |
|
149 |
#' a file for each unique bundle, which is read in as needed by the parallel workers. |
|
150 |
#' |
|
151 |
#' @section Parallelization: |
|
152 |
#' \code{text}s are split into bundles based on the \code{bundle_size} argument. Each bundle represents |
|
153 |
#' a single request to the API, which is why they are limited to 1000 texts and a total size of 10 MB. |
|
154 |
#' When there is more than one bundle and either \code{cores} is greater than 1 or \code{use_future} is \code{TRUE} (and you've |
|
155 |
#' externally specified a \code{\link[future]{plan}}), bundles are processed by multiple cores. |
|
156 |
#' |
|
157 |
#' If you have texts spread across multiple files, they can be most efficiently processed in parallel |
|
158 |
#' if each file contains a single text (potentially collapsed from multiple lines). If files contain |
|
159 |
#' multiple texts (i.e., \code{collapse_lines = FALSE}), then texts need to be read in before bundling |
|
160 |
#' in order to ensure bundles are under the length limit. |
|
161 |
#' |
|
162 |
#' Whether processing in serial or parallel, progress bars can be specified externally with |
|
163 |
#' \code{\link[progressr]{handlers}}; see examples. |
|
164 |
#' @examples |
|
165 |
#' \dontrun{ |
|
166 |
#' |
|
167 |
#' # check that the API is available, and your credentials work |
|
168 |
#' receptiviti_status() |
|
169 |
#' |
|
170 |
#' # score a single text |
|
171 |
#' single <- receptiviti("a text to score") |
|
172 |
#' |
|
173 |
#' # score multiple texts, and write results to a file |
|
174 |
#' multi <- receptiviti(c("first text to score", "second text"), "filename.csv") |
|
175 |
#' |
|
176 |
#' # score many texts in separate files |
|
177 |
#' ## defaults to look for .txt files |
|
178 |
#' file_results <- receptiviti(dir = "./path/to/txt_folder") |
|
179 |
#' |
|
180 |
#' ## could be .csv |
|
181 |
#' file_results <- receptiviti( |
|
182 |
#' dir = "./path/to/csv_folder", |
|
183 |
#' text_column = "text", file_type = "csv" |
|
184 |
#' ) |
|
185 |
#' |
|
186 |
#' # score many texts from a file, with a progress bar |
|
187 |
#' ## set up cores and progress bar |
|
188 |
#' ## (only necessary if you want the progress bar) |
|
189 |
#' future::plan("multisession") |
|
190 |
#' progressr::handlers(global = TRUE) |
|
191 |
#' progressr::handlers("progress") |
|
192 |
#' |
|
193 |
#' ## make request |
|
194 |
#' results <- receptiviti( |
|
195 |
#' "./path/to/largefile.csv", |
|
196 |
#' text_column = "text", use_future = TRUE |
|
197 |
#' ) |
|
198 |
#' } |
|
199 |
#' @importFrom curl new_handle curl_fetch_memory curl_fetch_disk handle_setopt |
|
200 |
#' @importFrom jsonlite toJSON fromJSON read_json |
|
201 |
#' @importFrom utils object.size |
|
202 |
#' @importFrom digest digest |
|
203 |
#' @importFrom parallel detectCores makeCluster clusterExport parLapplyLB parLapply stopCluster |
|
204 |
#' @importFrom progressr progressor |
|
205 |
#' @importFrom stringi stri_enc_detect |
|
206 |
#' @export |
|
207 | ||
208 |
receptiviti <- function( |
|
209 |
text = NULL, |
|
210 |
output = NULL, |
|
211 |
id = NULL, |
|
212 |
text_column = NULL, |
|
213 |
id_column = NULL, |
|
214 |
files = NULL, |
|
215 |
dir = NULL, |
|
216 |
file_type = "txt", |
|
217 |
encoding = NULL, |
|
218 |
return_text = FALSE, |
|
219 |
context = "written", |
|
220 |
custom_context = FALSE, |
|
221 |
api_args = getOption("receptiviti.api_args", list()), |
|
222 |
frameworks = getOption("receptiviti.frameworks", "all"), |
|
223 |
framework_prefix = TRUE, |
|
224 |
as_list = FALSE, |
|
225 |
bundle_size = 1000, |
|
226 |
bundle_byte_limit = 75e5, |
|
227 |
collapse_lines = FALSE, |
|
228 |
retry_limit = 50, |
|
229 |
clear_cache = FALSE, |
|
230 |
clear_scratch_cache = TRUE, |
|
231 |
request_cache = TRUE, |
|
232 |
cores = detectCores() - 1, |
|
233 |
collect_results = TRUE, |
|
234 |
use_future = FALSE, |
|
235 |
in_memory = TRUE, |
|
236 |
verbose = FALSE, |
|
237 |
overwrite = FALSE, |
|
238 |
compress = FALSE, |
|
239 |
make_request = TRUE, |
|
240 |
text_as_paths = FALSE, |
|
241 |
cache = Sys.getenv("RECEPTIVITI_CACHE"), |
|
242 |
cache_overwrite = FALSE, |
|
243 |
cache_format = Sys.getenv("RECEPTIVITI_CACHE_FORMAT", "parquet"), |
|
244 |
key = Sys.getenv("RECEPTIVITI_KEY"), |
|
245 |
secret = Sys.getenv("RECEPTIVITI_SECRET"), |
|
246 |
url = Sys.getenv("RECEPTIVITI_URL"), |
|
247 |
version = Sys.getenv("RECEPTIVITI_VERSION"), |
|
248 |
endpoint = Sys.getenv("RECEPTIVITI_ENDPOINT") |
|
249 |
) { |
|
250 |
# check input |
|
251 | 72x |
if (!is.null(output)) { |
252 | 4x |
if (!file.exists(output) && file.exists(paste0(output, ".xz"))) |
253 | ! |
output <- paste0(output, ".xz") |
254 | 4x |
if (!overwrite && file.exists(output)) |
255 | 1x |
stop( |
256 | 1x |
"output file already exists; use overwrite = TRUE to overwrite it", |
257 | 1x |
call. = FALSE |
258 |
) |
|
259 |
} |
|
260 | 71x |
if (isTRUE(cache)) { |
261 | 2x |
temp <- dirname(tempdir()) |
262 | ! |
if (basename(temp) == "working_dir") temp <- dirname(dirname(temp)) |
263 | 2x |
cache <- paste0(temp, "/receptiviti_cache") |
264 | 2x |
if (!dir.exists(cache)) { |
265 |
if ( |
|
266 | 2x |
interactive() && |
267 | 2x |
!isFALSE(getOption("receptiviti.cache_prompt")) && |
268 | 2x |
grepl( |
269 | 2x |
"^(?:[Yy1]|$)", |
270 | 2x |
readline("Do you want to establish a default cache? [Y/n] ") |
271 |
) |
|
272 |
) { |
|
273 |
} else { |
|
274 | 2x |
options(receptiviti.cache_prompt = FALSE) |
275 | 2x |
cache <- "" |
276 |
} |
|
277 |
} |
|
278 |
} |
|
279 | ! |
if (!is.character(cache)) cache <- "" |
280 | 71x |
if (cache != "") { |
281 | 31x |
if (!requireNamespace("arrow", quietly = TRUE)) { |
282 | ! |
stop("install the `arrow` package to enable the cache", call. = FALSE) |
283 |
} |
|
284 | 31x |
if (!(cache_format %in% c("parquet", "feather"))) { |
285 | ! |
stop("cache format can only be `parquet` or `feather`", call. = FALSE) |
286 |
} |
|
287 | 2x |
if (clear_cache) unlink(cache, TRUE) |
288 | 31x |
dir.create(cache, FALSE, TRUE) |
289 | 31x |
cached_parts <- list.files( |
290 | 31x |
cache, |
291 | 31x |
cache_format, |
292 | 31x |
recursive = TRUE, |
293 | 31x |
full.names = TRUE |
294 |
) |
|
295 |
} |
|
296 | 71x |
st <- proc.time()[[3]] |
297 | 71x |
if (is.character(custom_context)) { |
298 | 3x |
context <- custom_context |
299 | 3x |
custom_context <- TRUE |
300 |
} |
|
301 | 71x |
if (context != "written") { |
302 | 4x |
norming_status <- receptiviti_norming( |
303 | 4x |
name_only = TRUE, |
304 | 4x |
url = url, |
305 | 4x |
key = key, |
306 | 4x |
secret = secret, |
307 | 4x |
verbose = FALSE |
308 |
) |
|
309 | 4x |
if (verbose) { |
310 | ! |
message( |
311 | ! |
"retrieved custom norming context list (", |
312 | ! |
round(proc.time()[[3]] - st, 4), |
313 |
")" |
|
314 |
) |
|
315 |
} |
|
316 | 4x |
context_id <- if (custom_context) paste0("custom/", context) else context |
317 | 4x |
if (!length(norming_status) || !(context_id %in% norming_status)) { |
318 | 2x |
stop( |
319 | 2x |
"custom norming context ", |
320 | 2x |
context, |
321 | 2x |
" is not on record or is not complete", |
322 | 2x |
call. = FALSE |
323 |
) |
|
324 |
} |
|
325 |
} |
|
326 |
if ( |
|
327 | 69x |
length(frameworks) && |
328 | 69x |
!("all" %in% frameworks) && |
329 | 69x |
grepl("2", version, fixed = TRUE) |
330 |
) { |
|
331 | 1x |
api_args$frameworks <- paste0( |
332 | 1x |
frameworks[frameworks != "summary"], |
333 | 1x |
collapse = "," |
334 |
) |
|
335 |
} |
|
336 | 69x |
if (!is.null(api_args$frameworks)) { |
337 | 1x |
available_frameworks <- c( |
338 | 1x |
"summary", |
339 | 1x |
receptiviti_frameworks(url, key, secret) |
340 |
) |
|
341 | 1x |
if (verbose) |
342 | ! |
message( |
343 | ! |
"retrived frameworks list (", |
344 | ! |
round(proc.time()[[3]] - st, 4), |
345 |
")" |
|
346 |
) |
|
347 | 1x |
arg_frameworks <- unlist(strsplit(api_args$frameworks, ",", fixed = TRUE)) |
348 | 1x |
su <- !(arg_frameworks %in% available_frameworks) |
349 | 1x |
if (any(su)) { |
350 | 1x |
stop( |
351 | 1x |
"requested framework(s) are not available to your account: ", |
352 | 1x |
paste(arg_frameworks[su], collapse = ", "), |
353 | 1x |
call. = FALSE |
354 |
) |
|
355 |
} |
|
356 |
} |
|
357 | 68x |
res <- manage_request( |
358 | 68x |
text, |
359 | 68x |
id = id, |
360 | 68x |
text_column = text_column, |
361 | 68x |
id_column = id_column, |
362 | 68x |
files = files, |
363 | 68x |
dir = dir, |
364 | 68x |
file_type = file_type, |
365 | 68x |
encoding = encoding, |
366 | 68x |
context = if (custom_context) paste0("custom/", context) else context, |
367 | 68x |
api_args = api_args, |
368 | 68x |
bundle_size = bundle_size, |
369 | 68x |
bundle_byte_limit = bundle_byte_limit, |
370 | 68x |
collapse_lines = collapse_lines, |
371 | 68x |
retry_limit = retry_limit, |
372 | 68x |
clear_scratch_cache = clear_scratch_cache, |
373 | 68x |
request_cache = request_cache, |
374 | 68x |
cores = cores, |
375 | 68x |
use_future = use_future, |
376 | 68x |
in_memory = in_memory, |
377 | 68x |
verbose = verbose, |
378 | 68x |
make_request = make_request, |
379 | 68x |
text_as_paths = text_as_paths, |
380 | 68x |
cache = cache, |
381 | 68x |
cache_overwrite = cache_overwrite, |
382 | 68x |
cache_format = cache_format, |
383 | 68x |
key = key, |
384 | 68x |
secret = secret, |
385 | 68x |
url = url, |
386 | 68x |
version = version, |
387 | 68x |
endpoint = endpoint |
388 |
) |
|
389 | 44x |
data <- res$data |
390 | 44x |
final_res <- res$final_res |
391 | ||
392 |
# defragment cache |
|
393 | 44x |
if (cache != "") { |
394 | 29x |
cache <- normalizePath(cache, "/", FALSE) |
395 | 29x |
exclude <- c("id", "bin", names(api_args)) |
396 | 29x |
bin_dirs <- list.dirs(cache) |
397 | 29x |
if (length(bin_dirs) > 1) { |
398 | 29x |
if (verbose) |
399 | 2x |
message("defragmenting cache (", round(proc.time()[[3]] - st, 4), ")") |
400 | 29x |
write_time <- as.numeric(Sys.time()) |
401 | 29x |
for (bin_dir in bin_dirs[-1]) { |
402 | 295x |
files <- list.files(bin_dir, cache_format, full.names = TRUE) |
403 | 295x |
if (length(files) > 1) { |
404 | 2x |
previous <- files[!(files %in% cached_parts)] |
405 | 2x |
if (collect_results && length(previous)) { |
406 | 2x |
existing_cols <- unique(c( |
407 | 2x |
"id", |
408 | 2x |
"bin", |
409 | 2x |
names(arrow::schema(arrow::open_dataset( |
410 | 2x |
previous[[1]], |
411 | 2x |
format = cache_format |
412 |
))) |
|
413 |
)) |
|
414 |
if ( |
|
415 | 2x |
length(existing_cols) != ncol(final_res) || |
416 | 2x |
!all(existing_cols %in% colnames(final_res)) |
417 |
) { |
|
418 | ! |
if (verbose) |
419 | ! |
message(" clearing existing cache since columns did not align") |
420 | ! |
unlink(previous) |
421 |
} |
|
422 |
} |
|
423 | 2x |
bin_content <- dplyr::compute(arrow::open_dataset( |
424 | 2x |
bin_dir, |
425 | 2x |
format = cache_format |
426 |
)) |
|
427 | 2x |
su <- !duplicated(as.character(bin_content$text_hash)) |
428 | ! |
if (!all(su)) bin_content <- bin_content[su, ] |
429 | 2x |
writer <- if (cache_format == "parquet") arrow::write_parquet else |
430 | 2x |
arrow::write_feather |
431 | 2x |
all_rows <- nrow(bin_content) |
432 | 2x |
for (i in seq_len(ceiling(all_rows / 1e9))) { |
433 | 2x |
writer( |
434 | 2x |
bin_content[seq((i - 1) * 1e9 + 1, min(all_rows, i * 1e9)), ], |
435 | 2x |
paste0(bin_dir, "/part-", write_time, "-", i, ".", cache_format) |
436 |
) |
|
437 |
} |
|
438 | 2x |
unlink(files) |
439 |
} |
|
440 |
} |
|
441 |
} |
|
442 |
} |
|
443 | ||
444 | 44x |
if (!collect_results) { |
445 | ! |
if (verbose) message("done (", round(proc.time()[[3]] - st, 4), ")") |
446 | 1x |
return(invisible(NULL)) |
447 |
} |
|
448 | ||
449 |
# prepare final results |
|
450 | 43x |
if (verbose) |
451 | 2x |
message("preparing output (", round(proc.time()[[3]] - st, 4), ")") |
452 | 43x |
rownames(final_res) <- final_res$id |
453 | 43x |
rownames(data) <- data$id |
454 | 43x |
data$text_hash <- structure( |
455 | 43x |
final_res$text_hash, |
456 | 43x |
names = data[final_res$id, "text"] |
457 | 43x |
)[data$text] |
458 | 43x |
final_res <- cbind( |
459 | 43x |
data[, |
460 | 43x |
c(if (return_text) "text", if (res$provided_id) "id", "text_hash"), |
461 | 43x |
drop = FALSE |
462 |
], |
|
463 | 43x |
final_res[ |
464 | 43x |
structure(final_res$id, names = final_res$text_hash)[data$text_hash], |
465 | 43x |
!colnames(final_res) %in% c("id", "bin", "text_hash", "custom"), |
466 | 43x |
drop = FALSE |
467 |
] |
|
468 |
) |
|
469 | 43x |
row.names(final_res) <- NULL |
470 | 43x |
if (!is.null(output)) { |
471 | ! |
if (!grepl("\\.csv", output, TRUE)) output <- paste0(output, ".csv") |
472 | 3x |
if (compress && !grepl(".xz", output, fixed = TRUE)) |
473 | 1x |
output <- paste0(output, ".xz") |
474 | 1x |
if (grepl(".xz", output, fixed = TRUE)) compress <- TRUE |
475 | 3x |
if (verbose) |
476 | 1x |
message( |
477 | 1x |
"writing results to file: ", |
478 | 1x |
output, |
479 |
" (", |
|
480 | 1x |
round(proc.time()[[3]] - st, 4), |
481 |
")" |
|
482 |
) |
|
483 | 3x |
dir.create(dirname(output), FALSE, TRUE) |
484 | 2x |
if (overwrite) unlink(output) |
485 | 1x |
if (compress) output <- xzfile(output) |
486 | 3x |
arrow::write_csv_arrow(final_res, file = output) |
487 |
} |
|
488 | ||
489 | 43x |
if (is.character(frameworks) && frameworks[1] != "all") { |
490 | 6x |
if (verbose) |
491 | ! |
message("selecting frameworks (", round(proc.time()[[3]] - st, 4), ")") |
492 | 6x |
vars <- colnames(final_res) |
493 | 6x |
sel <- grepl( |
494 | 6x |
paste0("^(?:", paste(tolower(frameworks), collapse = "|"), ")"), |
495 | 6x |
vars |
496 |
) |
|
497 | 6x |
if (any(sel)) { |
498 |
if ( |
|
499 | 5x |
missing(framework_prefix) && |
500 | 5x |
(length(frameworks) == 1 && frameworks != "all") |
501 |
) |
|
502 | 3x |
framework_prefix <- FALSE |
503 | 5x |
sel <- unique(c("text", "id", "text_hash", names(api_args), vars[sel])) |
504 | 5x |
sel <- sel[sel %in% vars] |
505 | 5x |
final_res <- final_res[, sel] |
506 |
} else { |
|
507 | 1x |
warning( |
508 | 1x |
"frameworks did not match any columns -- returning all", |
509 | 1x |
call. = FALSE |
510 |
) |
|
511 |
} |
|
512 |
} |
|
513 | 43x |
if (as_list) { |
514 | 1x |
if (missing(framework_prefix)) framework_prefix <- FALSE |
515 | 1x |
inall <- c("text", "id", "text_hash", names(api_args)) |
516 | 1x |
cols <- colnames(final_res) |
517 | 1x |
inall <- inall[inall %in% cols] |
518 | 1x |
pre <- sub("\\..*$", "", cols) |
519 | 1x |
pre <- unique(pre[!pre %in% inall]) |
520 | 1x |
final_res <- lapply(structure(pre, names = pre), function(f) { |
521 | 9x |
res <- final_res[, c(inall, grep(paste0("^", f), cols, value = TRUE))] |
522 | 9x |
if (!framework_prefix) colnames(res) <- sub("^.+\\.", "", colnames(res)) |
523 | 9x |
res |
524 |
}) |
|
525 | 42x |
} else if (!framework_prefix) |
526 | 4x |
colnames(final_res) <- sub("^.+\\.", "", colnames(final_res)) |
527 | 2x |
if (verbose) message("done (", round(proc.time()[[3]] - st, 4), ")") |
528 | 43x |
invisible(final_res) |
529 |
} |
1 |
#' @rdname receptiviti
#' @export
receptiviti_status <- function(
  url = Sys.getenv("RECEPTIVITI_URL"),
  key = Sys.getenv("RECEPTIVITI_KEY"),
  secret = Sys.getenv("RECEPTIVITI_SECRET"),
  verbose = TRUE,
  include_headers = FALSE
) {
  # validate credentials / URL and build an authenticated curl handle
  params <- handle_request_params(url, key, secret)
  ping <- tryCatch(
    curl_fetch_memory(paste0(params$url, "/v1/ping"), params$handler),
    error = function(e) NULL
  )
  if (is.null(ping)) {
    if (verbose) message("Status: ERROR\nMessage: URL is unreachable")
    # fix: was `invisible(return())` -- `return` exits the function while
    # being evaluated as `invisible`'s argument, so `invisible` never took
    # effect and NULL was returned visibly
    return(invisible(NULL))
  }
  ping$content <- list(message = rawToChar(ping$content))
  # a response body starting with "{" is taken to be JSON and parsed
  # (fix: test the message string itself rather than relying on `substr`
  # coercing the wrapping list; consistent with receptiviti_frameworks)
  if (substr(ping$content$message, 1, 1) == "{")
    ping$content <- fromJSON(ping$content$message)
  # a successful ping has status 200 and no error code in the body
  ok <- ping$status_code == 200 && !length(ping$content$code)
  ping$status_message <- if (ok) {
    ping$content$pong
  } else {
    paste0(
      if (length(ping$content$code))
        paste0(ping$status_code, " (", ping$content$code, "): "),
      if (
        # long or HTML-like bodies are not useful messages; fall back to
        # the bare status code
        nchar(ping$content$message) > 500 ||
          grepl("<", ping$content$message, fixed = TRUE)
      ) {
        ping$status_code
      } else {
        ping$content$message
      }
    )
  }
  if (verbose) {
    message(
      "Status: ",
      if (ok) "OK" else "ERROR",
      "\nMessage: ",
      ping$status_message
    )
    if (include_headers) {
      # split the raw header block into individual lines
      ping$headers <- strsplit(
        rawToChar(ping$headers),
        "[\r\n]+",
        perl = TRUE
      )[[1]]
      # pretty-print any JSON object embedded in a header line
      json <- regexec("\\{.+\\}", ping$headers)
      for (i in seq_along(json)) {
        if (json[[i]] != -1) {
          regmatches(ping$headers[[i]], json[[i]]) <- paste(
            " ",
            strsplit(
              toJSON(
                fromJSON(regmatches(ping$headers[[i]], json[[i]])),
                auto_unbox = TRUE,
                pretty = TRUE
              ),
              "\n"
            )[[1]],
            collapse = "\n"
          )
        }
      }
      message(paste0("\n", paste(" ", ping$headers, collapse = "\n")))
    }
  }
  # the full ping response (with status_message added), invisibly
  invisible(ping)
}
|
75 | ||
76 |
# Validate request credentials and normalize the API URL.
#
# Stops with an informative error when the key or secret is missing, or
# when the URL does not look valid. Returns a list containing the cleaned
# `url` and an authenticated curl `handler`.
handle_request_params <- function(url, key, secret) {
  if (key == "") {
    stop(
      "specify your key, or set it to the RECEPTIVITI_KEY environment variable",
      call. = FALSE
    )
  }
  if (secret == "") {
    stop(
      "specify your secret, or set it to the RECEPTIVITI_SECRET environment variable",
      call. = FALSE
    )
  }
  # strip any trailing version path (e.g., "/v2/...") and trailing slashes
  base <- sub("/+[Vv]\\d+(?:/.*)?$|/+$", "", url)
  # default to https when no scheme was provided
  if (!grepl("http", tolower(url), fixed = TRUE)) {
    base <- paste0("https://", base)
  }
  if (!grepl("^https?://[^.]+[.:][^.]", base, ignore.case = TRUE)) {
    stop(
      "url does not appear to be valid: ",
      base,
      call. = FALSE
    )
  }
  list(
    url = base,
    handler = new_handle(httpauth = 1, userpwd = paste0(key, ":", secret))
  )
}
1 |
#' List Available Frameworks
#'
#' Retrieve the list of frameworks available to your account.
#' @param url,key,secret Request arguments; same as those in \code{\link{receptiviti}}.
#' @returns A character vector containing the names of frameworks available to your account.
#' @examples
#' \dontrun{
#'
#' # see which frameworks are available to your account
#' frameworks <- receptiviti_frameworks()
#' }
#' @export
receptiviti_frameworks <- function(
  url = Sys.getenv("RECEPTIVITI_URL"),
  key = Sys.getenv("RECEPTIVITI_KEY"),
  secret = Sys.getenv("RECEPTIVITI_SECRET")
) {
  # resolve credentials / base URL and prepare an authenticated handle
  request <- handle_request_params(url, key, secret)
  response <- curl::curl_fetch_memory(
    paste0(request$url, "/v2/frameworks"),
    request$handler
  )
  body <- rawToChar(response$content)
  if (response$status_code == 200) {
    return(jsonlite::fromJSON(body))
  }
  # on failure, surface the server's message (parsed from JSON when possible)
  payload <- list(message = body)
  if (startsWith(payload$message, "{")) {
    payload <- jsonlite::fromJSON(payload$message)
  }
  stop("failed to retrieve frameworks list: ", payload$message, call. = FALSE)
}