1 |
manage_request <- function(text = NULL, id = NULL, text_column = NULL, id_column = NULL, files = NULL, dir = NULL, |
|
2 |
file_type = "txt", encoding = NULL, context = "written", |
|
3 |
api_args = getOption("receptiviti.api_args", list()), |
|
4 |
bundle_size = 1000, bundle_byte_limit = 75e5, collapse_lines = FALSE, retry_limit = 50, |
|
5 |
clear_scratch_cache = TRUE, request_cache = TRUE, cores = detectCores() - 1, collect_results = TRUE, |
|
6 |
use_future = FALSE, in_memory = TRUE, verbose = FALSE, make_request = TRUE, |
|
7 |
text_as_paths = FALSE, cache = Sys.getenv("RECEPTIVITI_CACHE"), cache_overwrite = FALSE, |
|
8 |
cache_format = Sys.getenv("RECEPTIVITI_CACHE_FORMAT", "parquet"), key = Sys.getenv("RECEPTIVITI_KEY"), |
|
9 |
secret = Sys.getenv("RECEPTIVITI_SECRET"), url = Sys.getenv("RECEPTIVITI_URL"), |
|
10 |
version = Sys.getenv("RECEPTIVITI_VERSION"), endpoint = Sys.getenv("RECEPTIVITI_ENDPOINT"), to_norming = FALSE) { |
|
11 |
# check input |
|
12 | 73x |
if (use_future && !requireNamespace("future.apply", quietly = TRUE)) { |
13 | ! |
stop("install the `future.apply` package to use future", call. = FALSE) |
14 |
} |
|
15 | 73x |
st <- proc.time()[[3]] |
16 | 73x |
text_as_dir <- FALSE |
17 | 73x |
if (is.null(text)) { |
18 | 3x |
if (!is.null(dir)) { |
19 | ! |
if (!dir.exists(dir)) stop("entered dir does not exist", call. = FALSE) |
20 | 1x |
text <- dir |
21 | 1x |
text_as_dir <- TRUE |
22 | 2x |
} else if (!is.null(files)) { |
23 | 1x |
text <- files |
24 | 1x |
text_as_paths <- TRUE |
25 |
} else { |
|
26 | 1x |
stop("enter text as the first argument, or use files or dir", call. = FALSE) |
27 |
} |
|
28 |
} |
|
29 | 72x |
if (text_as_paths) { |
30 | 1x |
if (anyNA(text)) stop("NAs are not allowed in text when being treated as file paths", call. = FALSE) |
31 | 1x |
if (!all(file.exists(text))) stop("not all of the files in text exist", call. = FALSE) |
32 |
} |
|
33 | 70x |
read_in <- FALSE |
34 | 70x |
handle_encoding <- function(file) { |
35 | 212x |
if (is.null(encoding)) { |
36 | 210x |
con <- gzfile(file, "rb") |
37 | 210x |
on.exit(close(con)) |
38 | 210x |
unlist(stringi::stri_enc_detect(readBin(con, "raw", file.size(file)))[[1]])[[1]] |
39 |
} else { |
|
40 | 2x |
encoding |
41 |
} |
|
42 |
} |
|
43 | 70x |
if (text_as_dir || text_as_paths || (is.character(text) && !anyNA(text) && all(nchar(text) < 500))) { |
44 | 52x |
if (text_as_dir || length(text) == 1 && dir.exists(text)) { |
45 | 1x |
if (verbose) message("reading in texts from directory: ", text, " (", round(proc.time()[[3]] - st, 4), ")") |
46 | 5x |
text_as_paths <- TRUE |
47 | 5x |
text <- normalizePath(list.files(text, file_type, full.names = TRUE), "/", FALSE) |
48 |
} |
|
49 | 52x |
if (text_as_paths || all(file.exists(text))) { |
50 | 15x |
text_as_paths <- collapse_lines |
51 | 15x |
if (!collapse_lines) { |
52 | 1x |
if (verbose) message("reading in texts from file list (", round(proc.time()[[3]] - st, 4), ")") |
53 | 12x |
if (is.null(id_column)) names(text) <- if (length(id) != length(text)) text else id |
54 | 13x |
if (all(grepl("\\.csv", text, TRUE))) { |
55 | 2x |
if (is.null(text_column)) stop("text appears to point to csv files, but text_column was not specified", call. = FALSE) |
56 | 4x |
read_in <- TRUE |
57 | 4x |
text <- unlist(lapply(text, function(f) { |
58 | 4x |
d <- tryCatch( |
59 |
{ |
|
60 | 4x |
enc <- handle_encoding(f) |
61 | 4x |
con <- gzfile(f, encoding = enc) |
62 | 4x |
arrow::read_csv_arrow(con, read_options = arrow::CsvReadOptions$create( |
63 | 4x |
encoding = enc |
64 | 4x |
), col_select = c(text_column, id_column)) |
65 |
}, |
|
66 | 4x |
error = function(e) NULL |
67 |
) |
|
68 | 1x |
if (is.null(d)) stop("failed to read in file ", f, call. = FALSE) |
69 | 3x |
if (!is.null(id_column) && id_column %in% colnames(d)) { |
70 | 1x |
structure(d[, text_column, drop = TRUE], names = d[, id_column, drop = TRUE]) |
71 |
} else { |
|
72 | 2x |
d[, text_column, drop = TRUE] |
73 |
} |
|
74 |
})) |
|
75 |
} else { |
|
76 | 7x |
text <- unlist(lapply(text, function(f) { |
77 | 156x |
tryCatch( |
78 |
{ |
|
79 | 156x |
con <- gzfile(f, encoding = handle_encoding(f)) |
80 | 156x |
on.exit(close(con)) |
81 | 156x |
d <- readLines(con, warn = FALSE, skipNul = TRUE) |
82 | 156x |
d[d != ""] |
83 |
}, |
|
84 | 156x |
error = function(e) stop("failed to read in file ", f, call. = FALSE) |
85 |
) |
|
86 |
})) |
|
87 |
} |
|
88 | 10x |
id <- names(text) |
89 |
} |
|
90 | 37x |
} else if (length(text) == 1 && dirname(text) != "." && dir.exists(dirname(dirname(text)))) { |
91 | ! |
stop("text appears to be a directory, but it does not exist") |
92 |
} |
|
93 | 49x |
if (text_as_paths && is.null(id)) { |
94 | 2x |
id <- text |
95 | ! |
if (anyDuplicated(id)) id <- names(unlist(lapply(split(id, factor(id, unique(id))), seq_along))) |
96 |
} |
|
97 |
} |
|
98 | 67x |
if (is.null(dim(text))) { |
99 | 61x |
if (!read_in) { |
100 | 1x |
if (!text_as_paths && !is.null(text_column)) stop("text_column is specified, but text has no columns", call. = FALSE) |
101 | 1x |
if (!is.null(id_column)) stop("id_column is specified, but text has no columns", call. = FALSE) |
102 |
} |
|
103 |
} else { |
|
104 | 1x |
if (length(id) == 1 && id %in% colnames(text)) id_column <- id |
105 | 6x |
if (!is.null(id_column)) { |
106 | 2x |
if (id_column %in% colnames(text)) { |
107 | 1x |
id <- text[, id_column, drop = TRUE] |
108 |
} else { |
|
109 | 1x |
stop("id_column not found in text", call. = FALSE) |
110 |
} |
|
111 |
} |
|
112 | 5x |
if (!is.null(text_column)) { |
113 | 3x |
if (text_column %in% colnames(text)) { |
114 | 2x |
text <- text[, text_column, drop = TRUE] |
115 |
} else { |
|
116 | 1x |
if (!text_as_paths) stop("text_column not found in text", call. = FALSE) |
117 |
} |
|
118 |
} |
|
119 | 4x |
if (!is.null(dim(text))) { |
120 | 2x |
if (ncol(text) == 1) { |
121 | 1x |
text <- text[, 1, drop = TRUE] |
122 |
} else { |
|
123 | 1x |
stop("text has dimensions, but no text_column column", call. = FALSE) |
124 |
} |
|
125 |
} |
|
126 |
} |
|
127 | 2x |
if (!is.character(text)) text <- as.character(text) |
128 | 1x |
if (!length(text)) stop("no texts were found after resolving the text argument") |
129 | 4x |
if (length(id) && !is.character(id)) id <- as.character(id) |
130 | 61x |
provided_id <- FALSE |
131 | 61x |
if (length(id)) { |
132 | 1x |
if (length(id) != length(text)) stop("id is not the same length as text", call. = FALSE) |
133 | 1x |
if (anyDuplicated(id)) stop("id contains duplicate values", call. = FALSE) |
134 | 14x |
provided_id <- TRUE |
135 |
} else { |
|
136 | 45x |
id <- paste0("t", seq_along(text)) |
137 |
} |
|
138 | 1x |
if (!is.numeric(retry_limit)) retry_limit <- 0 |
139 | 59x |
if (to_norming) { |
140 | 2x |
version <- "v2" |
141 | 2x |
endpoint <- "norming" |
142 | 2x |
full_url <- url |
143 | 2x |
request_cache <- FALSE |
144 |
} else { |
|
145 | 57x |
url_parts <- unlist(strsplit(regmatches( |
146 | 57x |
url, gregexpr("/[Vv]\\d+(?:/[^/]+)?", url) |
147 | 57x |
)[[1]], "/", fixed = TRUE)) |
148 | 52x |
if (version == "") version <- if (length(url_parts) > 1) url_parts[[2]] else "v1" |
149 | 57x |
version <- tolower(version) |
150 | 57x |
if (version == "" || !grepl("^v\\d+$", version)) { |
151 | 1x |
stop("invalid version: ", version, call. = FALSE) |
152 |
} |
|
153 | 56x |
if (endpoint == "") { |
154 | 55x |
endpoint <- if (length(url_parts) > 2) { |
155 | 1x |
url_parts[[3]] |
156 |
} else { |
|
157 | 4x |
if (tolower(version) == "v1") "framework" else "analyze" |
158 |
} |
|
159 |
} |
|
160 | 56x |
endpoint <- sub("^.*/", "", tolower(endpoint)) |
161 | 56x |
if (endpoint == "" || grepl("[^a-z]", endpoint)) { |
162 | 1x |
stop("invalid endpoint: ", endpoint, call. = FALSE) |
163 |
} |
|
164 | 55x |
url <- paste0(sub("/+[Vv]\\d+(/.*)?$|/+$", "", url), "/", version, "/") |
165 | 55x |
full_url <- paste0(url, endpoint, if (version == "v1") "/bulk" else paste0("/", context)) |
166 | ! |
if (!is.list(api_args)) api_args <- as.list(api_args) |
167 | 55x |
if (version != "v1" && "context" %in% api_args && "custom_context" %in% api_args) { |
168 | ! |
stop("only one of `context` or `custom_context may be specified", call. = FALSE) |
169 |
} |
|
170 | 55x |
if (version != "v1" && length(api_args)) { |
171 | ! |
full_url <- paste0( |
172 | ! |
full_url, "?", paste0(names(api_args), "=", unlist(api_args), collapse = "&") |
173 |
) |
|
174 |
} |
|
175 |
} |
|
176 | 57x |
args_hash <- digest::digest(jsonlite::toJSON(c( |
177 | 57x |
api_args, |
178 | 57x |
url = full_url, key = key, secret = secret |
179 | 57x |
), auto_unbox = TRUE), serialize = FALSE) |
180 | ||
181 |
# ping API |
|
182 | 57x |
if (make_request) { |
183 | 5x |
if (verbose) message("pinging API (", round(proc.time()[[3]] - st, 4), ")") |
184 | 54x |
ping <- receptiviti_status(url, key, secret, verbose = FALSE) |
185 | 1x |
if (is.null(ping)) stop("URL is unreachable", call. = FALSE) |
186 | 1x |
if (ping$status_code != 200) stop(ping$status_message, call. = FALSE) |
187 |
} |
|
188 | ||
189 |
# prepare text |
|
190 | 5x |
if (verbose) message("preparing text (", round(proc.time()[[3]] - st, 4), ")") |
191 | 53x |
data <- data.frame(text = text, id = id, stringsAsFactors = FALSE) |
192 | 53x |
text <- data[!is.na(data$text) & data$text != "" & !duplicated(data$text), ] |
193 | 1x |
if (!nrow(text)) stop("no valid texts to process", call. = FALSE) |
194 | 1x |
if (!is.numeric(bundle_size)) bundle_size <- 1000 |
195 | 52x |
n_texts <- nrow(text) |
196 | 52x |
n <- ceiling(n_texts / min(1000, max(1, bundle_size))) |
197 | 52x |
bundles <- split(text, sort(rep_len(seq_len(n), nrow(text)))) |
198 | 52x |
size_fun <- if (text_as_paths) function(b) sum(file.size(b$text)) else object.size |
199 | 52x |
for (i in rev(seq_along(bundles))) { |
200 | 163x |
size <- size_fun(bundles[[i]]) |
201 | 163x |
if (size > bundle_byte_limit) { |
202 | 2x |
sizes <- vapply(seq_len(nrow(bundles[[i]])), function(r) as.numeric(size_fun(bundles[[i]][r, ])), 0) |
203 | 2x |
if (any(sizes > bundle_byte_limit)) { |
204 | 1x |
stop( |
205 | 1x |
"one of your texts is over the individual size limit (", bundle_byte_limit / 1024e3, " MB)", |
206 | 1x |
call. = FALSE |
207 |
) |
|
208 |
} |
|
209 | 1x |
bins <- rep(1, length(sizes)) |
210 | 1x |
bin_size <- 0 |
211 | 1x |
bi <- 1 |
212 | 1x |
for (ti in seq_along(bins)) { |
213 | 50x |
bin_size <- bin_size + sizes[ti] |
214 | 50x |
if (bin_size > bundle_byte_limit) { |
215 | 7x |
bin_size <- sizes[ti] |
216 | 7x |
bi <- bi + 1 |
217 |
} |
|
218 | 50x |
bins[ti] <- bi |
219 |
} |
|
220 | 1x |
bundles <- c(bundles[-i], unname(split(bundles[[i]], paste0(i, ".", bins)))) |
221 |
} |
|
222 |
} |
|
223 | 51x |
n_bundles <- length(bundles) |
224 | 51x |
bundle_ref <- if (n_bundles == 1) "bundle" else "bundles" |
225 | 5x |
if (verbose) message("prepared text in ", n_bundles, " ", bundle_ref, " (", round(proc.time()[[3]] - st, 4), ")") |
226 | ||
227 | 51x |
auth <- paste0(key, ":", secret) |
228 | ! |
if (is.null(in_memory) && (use_future || cores > 1) && n_bundles > cores) in_memory <- FALSE |
229 | 51x |
request_scratch <- NULL |
230 | 51x |
if (!in_memory) { |
231 | ! |
if (verbose) message("writing ", bundle_ref, " to disc (", round(proc.time()[[3]] - st, 4), ")") |
232 | 2x |
request_scratch <- paste0(tempdir(), "/receptiviti_request_scratch/") |
233 | 2x |
dir.create(request_scratch, FALSE) |
234 | 2x |
if (clear_scratch_cache) on.exit(unlink(request_scratch, recursive = TRUE)) |
235 | 2x |
bundles <- vapply(bundles, function(b) { |
236 | 6x |
scratch_bundle <- paste0(request_scratch, digest::digest(b), ".rds") |
237 | 6x |
if (!file.exists(scratch_bundle)) saveRDS(b, scratch_bundle, compress = FALSE) |
238 | 6x |
scratch_bundle |
239 | 2x |
}, "", USE.NAMES = FALSE) |
240 |
} |
|
241 | ||
242 | 51x |
doprocess <- function(bundles, cores, future) { |
243 | 8x |
env <- parent.frame() |
244 | 8x |
if (future) { |
245 | 1x |
eval(expression(future.apply::future_lapply(bundles, process)), envir = env) |
246 |
} else { |
|
247 | 7x |
cl <- parallel::makeCluster(cores) |
248 | 7x |
parallel::clusterExport(cl, ls(envir = env), env) |
249 | 7x |
on.exit(parallel::stopCluster(cl)) |
250 | 7x |
(if (length(bundles) > cores * 2) parallel::parLapplyLB else parallel::parLapply)(cl, bundles, process) |
251 |
} |
|
252 |
} |
|
253 | ||
254 | 51x |
request <- function(body, body_hash, bin, ids, attempt = retry_limit) { |
255 | 644x |
temp_file <- paste0(tempdir(), "/", body_hash, ".json") |
256 | 616x |
if (!request_cache) unlink(temp_file) |
257 | 644x |
res <- NULL |
258 | 644x |
if (!file.exists(temp_file)) { |
259 | 638x |
if (make_request) { |
260 | 637x |
handler <- tryCatch( |
261 | 637x |
curl::new_handle(httpauth = 1, userpwd = auth, copypostfields = body), |
262 | 637x |
error = function(e) e$message |
263 |
) |
|
264 | 637x |
if (is.character(handler)) { |
265 | ! |
stop(if (grepl("libcurl", handler, fixed = TRUE)) { |
266 | ! |
"libcurl encountered an error; try setting the bundle_byte_limit argument to a smaller value" |
267 |
} else { |
|
268 | ! |
paste("failed to create handler:", handler) |
269 | ! |
}, call. = FALSE) |
270 |
} |
|
271 | 2x |
if (to_norming) curl::handle_setopt(handler, customrequest = "PATCH") |
272 | 637x |
res <- curl::curl_fetch_disk(full_url, temp_file, handler) |
273 |
} else { |
|
274 | 1x |
stop("make_request is FALSE, but there are texts with no cached results", call. = FALSE) |
275 |
} |
|
276 |
} |
|
277 | 643x |
result <- if (file.exists(temp_file)) { |
278 | 643x |
if (is.null(res$type) || grepl("application/json", res$type, fixed = TRUE)) { |
279 | 643x |
tryCatch( |
280 | 643x |
jsonlite::read_json(temp_file, simplifyVector = TRUE), |
281 | 643x |
error = function(e) list(message = "invalid response format") |
282 |
) |
|
283 |
} else { |
|
284 | ! |
list(message = "invalid response format") |
285 |
} |
|
286 |
} else { |
|
287 | ! |
list(message = rawToChar(res$content)) |
288 |
} |
|
289 | 643x |
valid_result <- if (to_norming) { |
290 | 2x |
!is.null(result$submitted) |
291 |
} else { |
|
292 | 641x |
!is.null(result$results) || is.null(result$message) |
293 |
} |
|
294 | 643x |
if (valid_result) { |
295 | 92x |
if (!is.null(result$results)) result <- result$results |
296 | 94x |
if ("error" %in% names(result)) { |
297 | 1x |
if (!is.list(result$error)) { |
298 | ! |
warning("bundle ", body_hash, " failed: ", result$error) |
299 |
} else { |
|
300 | 1x |
su <- !is.na(result$error$code) |
301 | 1x |
errors <- result[su & !duplicated(result$error$code), "error"] |
302 | 1x |
warning( |
303 | 1x |
if (sum(su) > 1) "some texts were invalid: " else "a text was invalid: ", |
304 | 1x |
paste( |
305 | 1x |
do.call(paste0, data.frame("(", errors$code, ") ", errors$message, stringsAsFactors = FALSE)), |
306 | 1x |
collapse = "; " |
307 |
), |
|
308 | 1x |
call. = FALSE |
309 |
) |
|
310 |
} |
|
311 |
} |
|
312 | 94x |
if (to_norming) { |
313 | 2x |
cbind(body_hash = body_hash, as.data.frame(result)) |
314 |
} else { |
|
315 | 92x |
unpack <- function(d) { |
316 | 31438x |
if (is.list(d)) as.data.frame(lapply(d, unpack), optional = TRUE) else d |
317 |
} |
|
318 | 92x |
result <- unpack(result[!names(result) %in% c("response_id", "language", "version", "error")]) |
319 | 92x |
if (!is.null(result) && nrow(result)) { |
320 | 92x |
if (colnames(result)[[1]] == "request_id") { |
321 | 92x |
colnames(result)[[1]] <- "text_hash" |
322 |
} |
|
323 | 92x |
cbind(id = ids, bin = bin, result) |
324 |
} |
|
325 |
} |
|
326 |
} else { |
|
327 | 549x |
unlink(temp_file) |
328 | 549x |
if (length(result$message) == 1 && substr(result$message, 1, 1) == "{") { |
329 | ! |
result <- jsonlite::fromJSON(result$message) |
330 |
} |
|
331 | 549x |
if (attempt > 0 && (length(result$code) == 1 && result$code == 1420) || ( |
332 | 549x |
length(result$message) == 1 && result$message == "invalid response format" |
333 |
)) { |
|
334 | 524x |
wait_time <- as.numeric(regmatches(result$message, regexec("[0-9]+(?:\\.[0-9]+)?", result$message))) |
335 | 524x |
Sys.sleep(if (is.na(wait_time)) 1 else wait_time / 1e3) |
336 | 524x |
request(body, body_hash, bin, ids, attempt - 1) |
337 |
} else { |
|
338 | 25x |
message <- if (is.null(res$status_code)) 200 else res$status_code |
339 | 25x |
if (length(result$code)) message <- paste0(message, " (", result$code, "): ", result$message) |
340 | ! |
if (length(result$error)) message <- paste0(message, ": ", result$error) |
341 | 25x |
stop(message, call. = FALSE) |
342 |
} |
|
343 |
} |
|
344 |
} |
|
345 | ||
346 | 51x |
process <- function(bundle) { |
347 | 158x |
opts <- getOption("stringsAsFactors") |
348 | 158x |
options("stringsAsFactors" = FALSE) |
349 | 158x |
on.exit(options("stringsAsFactors" = opts)) |
350 | 6x |
if (is.character(bundle)) bundle <- readRDS(bundle) |
351 | 158x |
text <- bundle$text |
352 | 158x |
bin <- NULL |
353 | 158x |
if (text_as_paths) { |
354 | 4x |
if (all(grepl("\\.csv", text, TRUE))) { |
355 | ! |
if (is.null(text_column)) stop("files appear to be csv, but no text_column was specified", call. = FALSE) |
356 | 3x |
text <- vapply(text, function(f) { |
357 | 51x |
tryCatch( |
358 | 51x |
paste(arrow::read_csv_arrow(f, read_options = arrow::CsvReadOptions$create( |
359 | 51x |
encoding = handle_encoding(f) |
360 | 51x |
), col_select = dplyr::all_of(text_column))[[1]], collapse = " "), |
361 | 51x |
error = function(e) stop("failed to read in file ", f, call. = FALSE) |
362 |
) |
|
363 |
}, "") |
|
364 |
} else { |
|
365 | 1x |
text <- vapply(text, function(f) { |
366 | 1x |
tryCatch( |
367 |
{ |
|
368 | 1x |
con <- file(f, encoding = handle_encoding(f)) |
369 | 1x |
on.exit(close(con)) |
370 | 1x |
paste(readLines(con, warn = FALSE, skipNul = TRUE), collapse = " ") |
371 |
}, |
|
372 | 1x |
error = function(e) stop("failed to read in file ", f, call. = FALSE) |
373 |
) |
|
374 |
}, "") |
|
375 |
} |
|
376 |
} |
|
377 | 158x |
bundle$hashes <- paste0(vapply(paste0(args_hash, text), digest::digest, "", serialize = FALSE)) |
378 | 158x |
if (to_norming) { |
379 | 2x |
body <- jsonlite::toJSON(lapply( |
380 | 2x |
seq_along(text), function(i) list(text = text[[i]], request_id = bundle$hashes[[i]]) |
381 | 2x |
), auto_unbox = TRUE) |
382 | 2x |
res <- request(body, digest::digest(body, serialize = FALSE), initial, bundle$id) |
383 | 2x |
prog(amount = nrow(bundle)) |
384 |
} else { |
|
385 | 156x |
initial <- paste0("h", substr(bundle$hashes, 1, 1)) |
386 | 156x |
set <- !is.na(text) & text != "" & text != "logical(0)" & !duplicated(bundle$hashes) |
387 | 156x |
res_cached <- cached_cols <- res_fresh <- NULL |
388 | 156x |
nres <- ncached <- 0 |
389 | 156x |
check_cache <- !cache_overwrite && (cache != "" && length(list.dirs(cache))) |
390 | 156x |
if (check_cache) { |
391 | 42x |
db <- arrow::open_dataset( |
392 | 42x |
cache, |
393 | 42x |
partitioning = arrow::schema(bin = arrow::string()), format = cache_format |
394 |
) |
|
395 | 42x |
cached_cols <- colnames(db) |
396 | 42x |
cached <- if (!is.null(db$schema$GetFieldByName("text_hash"))) { |
397 | 40x |
text_hash <- NULL |
398 | 40x |
su <- dplyr::filter(db, bin %in% unique(initial), text_hash %in% bundle$hashes) |
399 | 40x |
tryCatch( |
400 | 40x |
dplyr::compute(if (collect_results) su else dplyr::select(su, text_hash)), |
401 | 40x |
error = function(e) matrix(integer(), 0) |
402 |
) |
|
403 |
} else { |
|
404 | 2x |
matrix(integer(), 0) |
405 |
} |
|
406 | 42x |
ncached <- nrow(cached) |
407 | 42x |
if (ncached) { |
408 | 38x |
cached <- as.data.frame(cached$to_data_frame()) |
409 | ! |
if (anyDuplicated(cached$text_hash)) cached <- cached[!duplicated(cached$text_hash), ] |
410 | 38x |
rownames(cached) <- cached$text_hash |
411 | 38x |
cached_set <- which(bundle$hashes %in% cached$text_hash) |
412 | 38x |
set[cached_set] <- FALSE |
413 | 38x |
if (collect_results) { |
414 | 38x |
res_cached <- cbind( |
415 | 38x |
id = bundle$id[cached_set], cached[bundle$hashes[cached_set], ] |
416 |
) |
|
417 |
} |
|
418 |
} |
|
419 |
} |
|
420 | 156x |
valid_options <- names(api_args) |
421 | 156x |
if (any(set)) { |
422 | 118x |
set <- which(set) |
423 | 118x |
make_bundle <- if (version == "v1") { |
424 | 113x |
function(i) { |
425 | 253x |
c(api_args, list(content = text[[i]], request_id = bundle$hashes[[i]])) |
426 |
} |
|
427 |
} else { |
|
428 | 5x |
function(i) { |
429 | 5x |
list(text = text[[i]], request_id = bundle$hashes[[i]]) |
430 |
} |
|
431 |
} |
|
432 | 118x |
body <- jsonlite::toJSON(unname(lapply(set, make_bundle)), auto_unbox = TRUE) |
433 | 118x |
body_hash <- digest::digest(body, serialize = FALSE) |
434 | 118x |
res_fresh <- request(body, body_hash, initial[set], bundle$id[set]) |
435 | 92x |
valid_options <- valid_options[valid_options %in% colnames(res_fresh)] |
436 | 92x |
if (length(valid_options)) { |
437 | 2x |
res_fresh <- res_fresh[, !colnames(res_fresh) %in% valid_options, drop = FALSE] |
438 |
} |
|
439 | 92x |
if (ncached && !all(cached_cols %in% colnames(res_fresh))) { |
440 | ! |
res_cached <- NULL |
441 | ! |
ncached <- 0 |
442 | ! |
body <- jsonlite::toJSON(lapply(cached_set, make_bundle), auto_unbox = TRUE) |
443 | ! |
res_fresh <- rbind(res_fresh, request( |
444 | ! |
body, digest::digest(body, serialize = FALSE), |
445 | ! |
initial[cached_set], bundle$id[cached_set] |
446 |
)) |
|
447 |
} |
|
448 | 92x |
nres <- nrow(res_fresh) |
449 | 92x |
if (cache != "" && nres) { |
450 | 4x |
writer <- if (cache_format == "parquet") arrow::write_parquet else arrow::write_feather |
451 | 4x |
cols <- vapply( |
452 | 4x |
res_fresh[, !(colnames(res_fresh) %in% c("id", "bin", names(api_args)))], |
453 | 4x |
is.character, TRUE |
454 |
) |
|
455 | 4x |
schema <- list() |
456 | 4x |
for (v in names(cols)) { |
457 | 796x |
schema[[v]] <- if (cols[[v]]) { |
458 | 4x |
arrow::string() |
459 | 796x |
} else if (v %in% c("summary.word_count", "summary.sentence_count")) { |
460 | ! |
if (anyNA(res_fresh[[v]])) res_fresh[[v]][is.na(res_fresh[[v]])] <- NA_integer_ |
461 | 8x |
arrow::int32() |
462 |
} else { |
|
463 | 4x |
if (anyNA(res_fresh[[v]])) res_fresh[[v]][is.na(res_fresh[[v]])] <- NA_real_ |
464 | 784x |
arrow::float64() |
465 |
} |
|
466 |
} |
|
467 | 4x |
schema <- arrow::schema(schema) |
468 | 4x |
for (part_bin in unique(res_fresh$bin)) { |
469 | 17x |
part <- res_fresh[res_fresh$bin == part_bin, ] |
470 | 17x |
part$id <- NULL |
471 | 17x |
part$bin <- NULL |
472 | 17x |
bin_dir <- paste0(cache, "/bin=", part_bin, "/") |
473 | 17x |
dir.create(bin_dir, FALSE, TRUE) |
474 | 17x |
writer( |
475 | 17x |
arrow::as_arrow_table(part, schema = schema), |
476 | 17x |
paste0(bin_dir, "fragment-", body_hash, "-0.", cache_format) |
477 |
) |
|
478 |
} |
|
479 |
} |
|
480 |
} |
|
481 | 130x |
if (collect_results) { |
482 | 130x |
res <- rbind(res_cached, res_fresh) |
483 | 2x |
if (length(valid_options)) for (n in valid_options) res[[n]] <- api_args[[n]] |
484 | 130x |
missing_ids <- !bundle$id %in% res$id |
485 | 130x |
if (any(missing_ids)) { |
486 | 1x |
varnames <- colnames(res)[colnames(res) != "id"] |
487 | 1x |
res <- rbind(res, cbind( |
488 | 1x |
id = bundle$id[missing_ids], |
489 | 1x |
as.data.frame(matrix(NA, sum(missing_ids), length(varnames), dimnames = list(NULL, varnames))) |
490 |
)) |
|
491 | 1x |
res$text_hash <- structure(bundle$hashes, names = bundle$id)[res$id] |
492 |
} |
|
493 |
} |
|
494 | 130x |
prog(amount = nres + ncached) |
495 |
} |
|
496 | ! |
if (collect_results) res else NULL |
497 |
} |
|
498 | ||
499 |
# make request(s) |
|
500 | 51x |
cores <- if (is.numeric(cores)) max(1, min(n_bundles, cores)) else 1 |
501 | 51x |
prog <- progressor(n_texts) |
502 | 51x |
results <- if (use_future || cores > 1) { |
503 | 8x |
call_env <- new.env(parent = globalenv()) |
504 | 8x |
environment(doprocess) <- call_env |
505 | 8x |
environment(request) <- call_env |
506 | 8x |
environment(process) <- call_env |
507 | 8x |
for (name in c( |
508 | 8x |
"doprocess", "request", "process", "text_column", "prog", "make_request", "full_url", "cache", |
509 | 8x |
"cache_overwrite", "use_future", "cores", "bundles", "cache_format", "request_cache", "auth", |
510 | 8x |
"version", "to_norming", "text_as_paths", "retry_limit", "api_args", "args_hash", "encoding", |
511 | 8x |
"handle_encoding", "collect_results" |
512 |
)) { |
|
513 | 192x |
call_env[[name]] <- get(name) |
514 |
} |
|
515 | 8x |
if (verbose) { |
516 | 1x |
message( |
517 | 1x |
"processing ", bundle_ref, " using ", if (use_future) "future backend" else paste(cores, "cores"), |
518 | 1x |
" (", round(proc.time()[[3]] - st, 4), ")" |
519 |
) |
|
520 |
} |
|
521 | 8x |
eval(expression(doprocess(bundles, cores, use_future)), envir = call_env) |
522 |
} else { |
|
523 | 4x |
if (verbose) message("processing ", bundle_ref, " sequentially (", round(proc.time()[[3]] - st, 4), ")") |
524 | 43x |
lapply(bundles, process) |
525 |
} |
|
526 | 5x |
if (verbose) message("done retrieving (", round(proc.time()[[3]] - st, 4), ")") |
527 | 49x |
if (collect_results) { |
528 | 49x |
final_res <- do.call(rbind, results) |
529 | 49x |
list(data = data, final_res = final_res, provided_id = provided_id) |
530 |
} else { |
|
531 | ! |
NULL |
532 |
} |
|
533 |
} |
1 |
.onLoad <- function(lib, pkg) { |
|
2 | ! |
if (Sys.getenv("RECEPTIVITI_URL") == "") Sys.setenv(RECEPTIVITI_URL = "https://api.receptiviti.com/") |
3 |
} |
|
4 | ||
5 |
#' Receptiviti API |
|
6 |
#' |
|
7 |
#' The main function to access the \href{https://www.receptiviti.com}{Receptiviti} API. |
|
8 |
#' |
|
9 |
#' @param text A character vector with text to be processed, path to a directory containing files, or a vector of file paths. |
|
10 |
#' If a single path to a directory, each file is collapsed to a single text. If a path to a file or files, |
|
11 |
#' each line or row is treated as a separate text, unless \code{collapse_lines} is \code{TRUE} (in which case, |
|
12 |
#' files will be read in as part of bundles at processing time, as is always the case when a directory). |
|
13 |
#' Use \code{files} to more reliably enter files, or \code{dir} to more reliably specify a directory. |
|
14 |
#' @param output Path to a \code{.csv} file to write results to. If this already exists, set \code{overwrite} to \code{TRUE} |
|
15 |
#' to overwrite it. |
|
16 |
#' @param id Vector of unique IDs the same length as \code{text}, to be included in the results. |
|
17 |
#' @param text_column,id_column Column name of text/id, if \code{text} is a matrix-like object, or a path to a csv file. |
|
18 |
#' @param files A list of file paths, as alternate entry to \code{text}. |
|
19 |
#' @param dir A directory to search for files in, as alternate entry to \code{text}. |
|
20 |
#' @param file_type File extension to search for, if \code{text} is the path to a directory containing files to be read in. |
|
21 |
#' @param encoding Encoding of file(s) to be read in. If not specified, this will be detected, which can fail, |
|
22 |
#' resulting in mis-encoded characters; for best (and fasted) results, specify encoding. |
|
23 |
#' @param return_text Logical; if \code{TRUE}, \code{text} is included as the first column of the result. |
|
24 |
#' @param context Name of the analysis context. |
|
25 |
#' @param custom_context Name of a custom context (as listed by \code{\link{receptiviti_norming}}), |
|
26 |
#' or \code{TRUE} if \code{context} is the name of a custom context. |
|
27 |
#' @param api_args A list of additional arguments to pass to the API (e.g., \code{list(sallee_mode = "sparse")}). Defaults to the |
|
28 |
#' \code{receptiviti.api_args} option. Custom norming contexts can be established with the \code{\link{receptiviti_norming}} |
|
29 |
#' function, then referred to here with the \code{custom_context} argument (only available in API V2). |
|
30 |
#' @param frameworks A vector of frameworks to include results from. Texts are always scored with all available framework -- |
|
31 |
#' this just specifies what to return. Defaults to \code{all}, to return all scored frameworks. Can be set by the |
|
32 |
#' \code{receptiviti.frameworks} option (e.g., \code{options(receptiviti.frameworks = c("liwc", "sallee"))}). |
|
33 |
#' @param framework_prefix Logical; if \code{FALSE}, will remove the framework prefix from column names, which may result in duplicates. |
|
34 |
#' If this is not specified, and 1 framework is selected, or \code{as_list} is \code{TRUE}, will default to remove prefixes. |
|
35 |
#' @param as_list Logical; if \code{TRUE}, returns a list with frameworks in separate entries. |
|
36 |
#' @param bundle_size Number of texts to include in each request; between 1 and 1,000. |
|
37 |
#' @param bundle_byte_limit Memory limit (in bytes) of each bundle, under \code{1e7} (10 MB, which is the API's limit). |
|
38 |
#' May need to be lower than the API's limit, depending on the system's requesting library. |
|
39 |
#' @param collapse_lines Logical; if \code{TRUE}, and \code{text} contains paths to files, each file is treated as a single text. |
|
40 |
#' @param retry_limit Maximum number of times each request can be retried after hitting a rate limit. |
|
41 |
#' @param overwrite Logical; if \code{TRUE}, will overwrite an existing \code{output} file. |
|
42 |
#' @param compress Logical; if \code{TRUE}, will save as an \code{xz}-compressed file. |
|
43 |
#' @param make_request Logical; if \code{FALSE}, a request is not made. This could be useful if you want to be sure and |
|
44 |
#' load from one of the caches, but aren't sure that all results exist there; it will error out if it encounters |
|
45 |
#' texts it has no other source for. |
|
46 |
#' @param text_as_paths Logical; if \code{TRUE}, ensures \code{text} is treated as a vector of file paths. Otherwise, this will be |
|
47 |
#' determined if there are no \code{NA}s in \code{text} and every entry is under 500 characters long. |
|
48 |
#' @param cache Path to a directory in which to save unique results for reuse; defaults to |
|
49 |
#' \code{Sys.getenv(}\code{"RECEPTIVITI_CACHE")}. See the Cache section for details. |
|
50 |
#' @param cache_overwrite Logical; if \code{TRUE}, will write results to the cache without reading from it. This could be used |
|
51 |
#' if you want fresh results to be cached without clearing the cache. |
|
52 |
#' @param cache_format Format of the cache database; see \code{\link[arrow]{FileFormat}}. |
|
53 |
#' Defaults to \code{Sys.getenv(}\code{"RECEPTIVITI_CACHE_FORMAT")}. |
|
54 |
#' @param clear_cache Logical; if \code{TRUE}, will clear any existing files in the cache. Use \code{cache_overwrite} if |
|
55 |
#' you want fresh results without clearing or disabling the cache. Use \code{cache = FALSE} to disable the cache. |
|
56 |
#' @param request_cache Logical; if \code{FALSE}, will always make a fresh request, rather than using the response |
|
57 |
#' from a previous identical request. |
|
58 |
#' @param cores Number of CPU cores to split bundles across, if there are multiple bundles. See the Parallelization section. |
|
59 |
#' @param collect_results Logical; if \code{FALSE}, will not retain bundle results in memory for return. |
|
60 |
#' @param use_future Logical; if \code{TRUE}, uses a \code{future} back-end to process bundles, in which case, |
|
61 |
#' parallelization can be controlled with the \code{\link[future]{plan}} function (e.g., \code{plan("multisession")} |
|
62 |
#' to use multiple cores); this is required to see progress bars when using multiple cores. See the Parallelization section. |
|
63 |
#' @param in_memory Logical; if \code{FALSE}, will write bundles to temporary files, and only load them as they are being requested. |
|
64 |
#' @param clear_scratch_cache Logical; if \code{FALSE}, will preserve the bundles written when \code{in_memory} is \code{TRUE}, after |
|
65 |
#' the request has been made. |
|
66 |
#' @param verbose Logical; if \code{TRUE}, will show status messages. |
|
67 |
#' @param key API Key; defaults to \code{Sys.getenv("RECEPTIVITI_KEY")}. |
|
68 |
#' @param secret API Secret; defaults to \code{Sys.getenv("RECEPTIVITI_SECRET")}. |
|
69 |
#' @param url API URL; defaults to \code{Sys.getenv("RECEPTIVITI_URL")}, which defaults to |
|
70 |
#' \code{"https://api.receptiviti.com/"}. |
|
71 |
#' @param version API version; defaults to \code{Sys.getenv("RECEPTIVITI_VERSION")}, which defaults to |
|
72 |
#' \code{"v1"}. |
|
73 |
#' @param endpoint API endpoint (path name after the version); defaults to \code{Sys.getenv("RECEPTIVITI_ENDPOINT")}, |
|
74 |
#' which defaults to \code{"framework"}. |
|
75 |
#' @param include_headers Logical; if \code{TRUE}, \code{receptiviti_status}'s verbose message will include |
|
76 |
#' the HTTP headers. |
|
77 |
#' |
|
78 |
#' @returns Nothing if \code{collect_results} is \code{FALSE}. |
|
79 |
#' Otherwise, a \code{data.frame} with columns for \code{text} (if \code{return_text} is \code{TRUE}; the originally entered text), |
|
80 |
#' \code{id} (if one was provided), \code{text_hash} (the MD5 hash of the text), a column each for relevant entries in \code{api_args}, |
|
81 |
#' and scores from each included framework (e.g., \code{summary.word_count} and \code{liwc.i}). If \code{as_list} is \code{TRUE}, |
|
82 |
#' returns a list with a named entry containing such a \code{data.frame} for each framework. |
|
83 |
#' |
|
84 |
#' @section Request Process: |
|
85 |
#' This function (along with the internal \code{manage_request} function) handles texts and results in several steps: |
|
86 |
#' \enumerate{ |
|
87 |
#' \item Prepare bundles (split \code{text} into <= \code{bundle_size} and <= \code{bundle_byte_limit} bundles). |
|
88 |
#' \enumerate{ |
|
89 |
#' \item If \code{text} points to a directory or list of files, these will be read in later. |
|
90 |
#' \item If \code{in_memory} is \code{FALSE}, bundles are written to a temporary location, |
|
91 |
#' and read back in when the request is made. |
|
92 |
#' } |
|
93 |
#' \item Get scores for texts within each bundle. |
|
94 |
#' \enumerate{ |
|
95 |
#' \item If texts are paths, or \code{in_memory} is \code{FALSE}, will load texts. |
|
96 |
#' \item If \code{cache} is set, will skip any texts with cached scores. |
|
97 |
#' \item If \code{request_cache} is \code{TRUE}, will check for a cached request. |
|
98 |
#' \item If any texts need scoring and \code{make_request} is \code{TRUE}, will send unscored texts to the API. |
|
99 |
#' } |
|
100 |
#' \item If a request was made and \code{request_cache} is set, will cache the response. |
|
101 |
#' \item If \code{cache} is set, will write bundle scores to the cache. |
|
102 |
#' \item After requests are made, if \code{cache} is set, will defragment the cache |
|
103 |
#' (combine bundle results within partitions). |
|
104 |
#' \item If \code{collect_results} is \code{TRUE}, will prepare results: |
|
105 |
#' \enumerate{ |
|
106 |
#' \item Will realign results with \code{text} (and \code{id} if provided). |
|
107 |
#' \item If \code{output} is specified, will write realigned results to it. |
|
108 |
#' \item Will drop additional columns (such as \code{custom} and \code{id} if not provided). |
|
109 |
#' \item If \code{framework} is specified, will use it to select columns of the results. |
|
110 |
#' \item Returns results. |
|
111 |
#' } |
|
112 |
#' } |
|
113 |
#' |
|
114 |
#' @section Cache: |
|
115 |
#' If the \code{cache} argument is specified, results for unique texts are saved in an |
|
116 |
#' \href{https://arrow.apache.org}{Arrow} database in the cache location |
|
117 |
#' (\code{Sys.getenv(}\code{"RECEPTIVITI_CACHE")}), and are retrieved with subsequent requests. |
|
118 |
#' This ensures that the exact same texts are not re-sent to the API. |
|
119 |
#' This does, however, add some processing time and disc space usage. |
|
120 |
#' |
|
121 |
#' If \code{cache} is \code{TRUE}, a default directory (\code{receptiviti_cache}) will be looked for |
|
122 |
#' in the system's temporary directory (which is usually the parent of \code{tempdir()}). |
|
123 |
#' If this does not exist, you will be asked if it should be created. |
|
124 |
#' |
|
125 |
#' The primary cache is checked when each bundle is processed, and existing results are loaded at |
|
126 |
#' that time. When processing many bundles in parallel, and many results have been cached, |
|
127 |
#' this can cause the system to freeze and potentially crash. |
|
128 |
#' To avoid this, limit the number of cores, or disable parallel processing. |
|
129 |
#' |
|
130 |
#' The \code{cache_format} arguments (or the \code{RECEPTIVITI_CACHE_FORMAT} environment variable) can be used to adjust the format of the cache. |
|
131 |
#' |
|
132 |
#' You can use the cache independently with \code{open_database(Sys.getenv("RECEPTIVITI_CACHE"))}. |
|
133 |
#' |
|
134 |
#' You can also set the \code{clear_cache} argument to \code{TRUE} to clear the cache before it is used again, which may be useful |
|
135 |
#' if the cache has gotten big, or you know new results will be returned. Even if a cached result exists, it will be |
|
136 |
#' reprocessed if it does not have all of the variables of new results, but this depends on there being at least 1 uncached |
|
137 |
#' result. If, for instance, you add a framework to your account and want to reprocess a previously processed set of texts, |
|
138 |
#' you would need to first clear the cache. |
|
139 |
#' |
|
140 |
#' Either way, duplicated texts within the same call will only be sent once. |
|
141 |
#' |
|
142 |
#' The \code{request_cache} argument controls a more temporary cache of each bundle request. This is cleared when the |
|
143 |
#' R session ends. You might want to set this to \code{FALSE} if a new framework becomes available on your account |
|
144 |
#' and you want to process a set of text you already processed in the current R session without restarting. |
|
145 |
#' |
|
146 |
#' Another temporary cache is made when \code{in_memory} is \code{FALSE}, which is the default when processing |
|
147 |
#' in parallel (when \code{cores} is over \code{1} or \code{use_future} is \code{TRUE}). This contains |
|
148 |
#' a file for each unique bundle, which is read in as needed by the parallel workers. |
|
149 |
#' |
|
150 |
#' @section Parallelization: |
|
151 |
#' \code{text}s are split into bundles based on the \code{bundle_size} argument. Each bundle represents |
|
152 |
#' a single request to the API, which is why they are limited to 1000 texts and a total size of 10 MB. |
|
153 |
#' When there is more than one bundle and either \code{cores} is greater than 1 or \code{use_future} is \code{TRUE} (and you've |
|
154 |
#' externally specified a \code{\link[future]{plan}}), bundles are processed by multiple cores. |
|
155 |
#' |
|
156 |
#' If you have texts spread across multiple files, they can be most efficiently processed in parallel |
|
157 |
#' if each file contains a single text (potentially collapsed from multiple lines). If files contain |
|
158 |
#' multiple texts (i.e., \code{collapse_lines = FALSE}), then texts need to be read in before bundling |
|
159 |
#' in order to ensure bundles are under the length limit. |
|
160 |
#' |
|
161 |
#' Whether processing in serial or parallel, progress bars can be specified externally with |
|
162 |
#' \code{\link[progressr]{handlers}}; see examples. |
|
163 |
#' @examples |
|
164 |
#' \dontrun{ |
|
165 |
#' |
|
166 |
#' # check that the API is available, and your credentials work |
|
167 |
#' receptiviti_status() |
|
168 |
#' |
|
169 |
#' # score a single text |
|
170 |
#' single <- receptiviti("a text to score") |
|
171 |
#' |
|
172 |
#' # score multiple texts, and write results to a file |
|
173 |
#' multi <- receptiviti(c("first text to score", "second text"), "filename.csv") |
|
174 |
#' |
|
175 |
#' # score many texts in separate files |
|
176 |
#' ## defaults to look for .txt files |
|
177 |
#' file_results <- receptiviti(dir = "./path/to/txt_folder") |
|
178 |
#' |
|
179 |
#' ## could be .csv |
|
180 |
#' file_results <- receptiviti( |
|
181 |
#' dir = "./path/to/csv_folder", |
|
182 |
#' text_column = "text", file_type = "csv" |
|
183 |
#' ) |
|
184 |
#' |
|
185 |
#' # score many texts from a file, with a progress bar |
|
186 |
#' ## set up cores and progress bar |
|
187 |
#' ## (only necessary if you want the progress bar) |
|
188 |
#' future::plan("multisession") |
|
189 |
#' progressr::handlers(global = TRUE) |
|
190 |
#' progressr::handlers("progress") |
|
191 |
#' |
|
192 |
#' ## make request |
|
193 |
#' results <- receptiviti( |
|
194 |
#' "./path/to/largefile.csv", |
|
195 |
#' text_column = "text", use_future = TRUE |
|
196 |
#' ) |
|
197 |
#' } |
|
198 |
#' @importFrom curl new_handle curl_fetch_memory curl_fetch_disk handle_setopt |
|
199 |
#' @importFrom jsonlite toJSON fromJSON read_json |
|
200 |
#' @importFrom utils object.size |
|
201 |
#' @importFrom digest digest |
|
202 |
#' @importFrom parallel detectCores makeCluster clusterExport parLapplyLB parLapply stopCluster |
|
203 |
#' @importFrom progressr progressor |
|
204 |
#' @importFrom stringi stri_enc_detect |
|
205 |
#' @export |
|
206 | ||
207 |
receptiviti <- function(text = NULL, output = NULL, id = NULL, text_column = NULL, id_column = NULL, files = NULL, dir = NULL, |
|
208 |
file_type = "txt", encoding = NULL, return_text = FALSE, context = "written", custom_context = FALSE, |
|
209 |
api_args = getOption("receptiviti.api_args", list()), |
|
210 |
frameworks = getOption("receptiviti.frameworks", "all"), framework_prefix = TRUE, as_list = FALSE, |
|
211 |
bundle_size = 1000, bundle_byte_limit = 75e5, collapse_lines = FALSE, retry_limit = 50, clear_cache = FALSE, |
|
212 |
clear_scratch_cache = TRUE, request_cache = TRUE, cores = detectCores() - 1, collect_results = TRUE, |
|
213 |
use_future = FALSE, in_memory = TRUE, verbose = FALSE, overwrite = FALSE, compress = FALSE, make_request = TRUE, |
|
214 |
text_as_paths = FALSE, cache = Sys.getenv("RECEPTIVITI_CACHE"), cache_overwrite = FALSE, |
|
215 |
cache_format = Sys.getenv("RECEPTIVITI_CACHE_FORMAT", "parquet"), key = Sys.getenv("RECEPTIVITI_KEY"), |
|
216 |
secret = Sys.getenv("RECEPTIVITI_SECRET"), url = Sys.getenv("RECEPTIVITI_URL"), |
|
217 |
version = Sys.getenv("RECEPTIVITI_VERSION"), endpoint = Sys.getenv("RECEPTIVITI_ENDPOINT")) { |
|
218 |
# check input |
|
219 | 75x |
if (!is.null(output)) { |
220 | ! |
if (!file.exists(output) && file.exists(paste0(output, ".xz"))) output <- paste0(output, ".xz") |
221 | 1x |
if (!overwrite && file.exists(output)) stop("output file already exists; use overwrite = TRUE to overwrite it", call. = FALSE) |
222 |
} |
|
223 | 74x |
if (isTRUE(cache)) { |
224 | 2x |
temp <- dirname(tempdir()) |
225 | ! |
if (basename(temp) == "working_dir") temp <- dirname(dirname(temp)) |
226 | 2x |
cache <- paste0(temp, "/receptiviti_cache") |
227 | 2x |
if (!dir.exists(cache)) { |
228 | 2x |
if (interactive() && !isFALSE(getOption("receptiviti.cache_prompt")) && |
229 | 2x |
grepl("^(?:[Yy1]|$)", readline("Do you want to establish a default cache? [Y/n] "))) { |
230 |
} else { |
|
231 | 2x |
options(receptiviti.cache_prompt = FALSE) |
232 | 2x |
cache <- "" |
233 |
} |
|
234 |
} |
|
235 |
} |
|
236 | ! |
if (!is.character(cache)) cache <- "" |
237 | 74x |
if (cache != "") { |
238 | 31x |
if (!requireNamespace("arrow", quietly = TRUE)) { |
239 | ! |
stop("install the `arrow` package to enable the cache", call. = FALSE) |
240 |
} |
|
241 | 31x |
if (!(cache_format %in% c("parquet", "feather"))) { |
242 | ! |
stop("cache format can only be `parquet` or `feather`", call. = FALSE) |
243 |
} |
|
244 | 2x |
if (clear_cache) unlink(cache, TRUE) |
245 | 31x |
dir.create(cache, FALSE, TRUE) |
246 | 31x |
cached_parts <- list.files(cache, cache_format, recursive = TRUE, full.names = TRUE) |
247 |
} |
|
248 | 74x |
st <- proc.time()[[3]] |
249 | 74x |
if (is.character(custom_context)) { |
250 | 3x |
context <- custom_context |
251 | 3x |
custom_context <- TRUE |
252 |
} |
|
253 | 74x |
if (context != "written") { |
254 | 4x |
norming_status <- receptiviti_norming( |
255 | 4x |
name_only = TRUE, url = url, key = key, secret = secret, verbose = FALSE |
256 |
) |
|
257 | 4x |
if (verbose) { |
258 | ! |
message( |
259 | ! |
"retrieved custom norming context list (", round(proc.time()[[3]] - st, 4), ")" |
260 |
) |
|
261 |
} |
|
262 | 4x |
context_id <- if (custom_context) paste0("custom/", context) else context |
263 | 4x |
if (!length(norming_status) || !(context_id %in% norming_status)) { |
264 | 2x |
stop( |
265 | 2x |
"custom norming context ", context, " is not on record or is not complete", |
266 | 2x |
call. = FALSE |
267 |
) |
|
268 |
} |
|
269 |
} |
|
270 | 72x |
if (length(frameworks) && !("all" %in% frameworks) && grepl("2", version, fixed = TRUE)) { |
271 | 1x |
api_args$frameworks <- paste0(frameworks[frameworks != "summary"], collapse = ",") |
272 |
} |
|
273 | 72x |
if (!is.null(api_args$frameworks)) { |
274 | 1x |
available_frameworks <- c("summary", receptiviti_frameworks(url, key, secret)) |
275 | ! |
if (verbose) message("retrived frameworks list (", round(proc.time()[[3]] - st, 4), ")") |
276 | 1x |
arg_frameworks <- unlist(strsplit(api_args$frameworks, ",", fixed = TRUE)) |
277 | 1x |
su <- !(arg_frameworks %in% available_frameworks) |
278 | 1x |
if (any(su)) { |
279 | 1x |
stop( |
280 | 1x |
"requested framework(s) are not available to your account: ", |
281 | 1x |
paste(arg_frameworks[su], collapse = ", "), |
282 | 1x |
call. = FALSE |
283 |
) |
|
284 |
} |
|
285 |
} |
|
286 | 71x |
res <- manage_request( |
287 | 71x |
text, |
288 | 71x |
id = id, text_column = text_column, id_column = id_column, files = files, dir = dir, |
289 | 71x |
file_type = file_type, encoding = encoding, |
290 | 71x |
context = if (custom_context) paste0("custom/", context) else context, |
291 | 71x |
api_args = api_args, bundle_size = bundle_size, |
292 | 71x |
bundle_byte_limit = bundle_byte_limit, collapse_lines = collapse_lines, |
293 | 71x |
retry_limit = retry_limit, clear_scratch_cache = clear_scratch_cache, |
294 | 71x |
request_cache = request_cache, cores = cores, use_future = use_future, in_memory = in_memory, |
295 | 71x |
verbose = verbose, make_request = make_request, text_as_paths = text_as_paths, cache = cache, |
296 | 71x |
cache_overwrite = cache_overwrite, cache_format = cache_format, key = key, secret = secret, |
297 | 71x |
url = url, version = version, endpoint = endpoint |
298 |
) |
|
299 | 47x |
data <- res$data |
300 | 47x |
final_res <- res$final_res |
301 | ||
302 |
# defragment cache |
|
303 | 47x |
if (cache != "") { |
304 | 29x |
cache <- normalizePath(cache, "/", FALSE) |
305 | 29x |
exclude <- c("id", "bin", names(api_args)) |
306 | 29x |
bin_dirs <- list.dirs(cache) |
307 | 29x |
if (length(bin_dirs) > 1) { |
308 | 2x |
if (verbose) message("defragmenting cache (", round(proc.time()[[3]] - st, 4), ")") |
309 | 29x |
write_time <- as.numeric(Sys.time()) |
310 | 29x |
for (bin_dir in bin_dirs[-1]) { |
311 | 276x |
files <- list.files(bin_dir, cache_format, full.names = TRUE) |
312 | 276x |
if (length(files) > 1) { |
313 | 2x |
previous <- files[!(files %in% cached_parts)] |
314 | 2x |
if (collect_results && length(previous)) { |
315 | 2x |
existing_cols <- unique(c("id", "bin", names(arrow::schema(arrow::open_dataset( |
316 | 2x |
previous[[1]], |
317 | 2x |
format = cache_format |
318 |
))))) |
|
319 |
if ( |
|
320 | 2x |
length(existing_cols) != ncol(final_res) || !all(existing_cols %in% colnames(final_res)) |
321 |
) { |
|
322 | ! |
if (verbose) message(" clearing existing cache since columns did not align") |
323 | ! |
unlink(previous) |
324 |
} |
|
325 |
} |
|
326 | 2x |
bin_content <- dplyr::compute(arrow::open_dataset(bin_dir, format = cache_format)) |
327 | 2x |
su <- !duplicated(as.character(bin_content$text_hash)) |
328 | ! |
if (!all(su)) bin_content <- bin_content[su, ] |
329 | 2x |
writer <- if (cache_format == "parquet") arrow::write_parquet else arrow::write_feather |
330 | 2x |
all_rows <- nrow(bin_content) |
331 | 2x |
for (i in seq_len(ceiling(all_rows / 1e9))) { |
332 | 2x |
writer( |
333 | 2x |
bin_content[seq((i - 1) * 1e9 + 1, min(all_rows, i * 1e9)), ], |
334 | 2x |
paste0(bin_dir, "/part-", write_time, "-", i, ".", cache_format) |
335 |
) |
|
336 |
} |
|
337 | 2x |
unlink(files) |
338 |
} |
|
339 |
} |
|
340 |
} |
|
341 |
} |
|
342 | ||
343 | 47x |
if (!collect_results) { |
344 | ! |
if (verbose) message("done (", round(proc.time()[[3]] - st, 4), ")") |
345 | 1x |
return(invisible(NULL)) |
346 |
} |
|
347 | ||
348 |
# prepare final results |
|
349 | 3x |
if (verbose) message("preparing output (", round(proc.time()[[3]] - st, 4), ")") |
350 | 46x |
rownames(final_res) <- final_res$id |
351 | 46x |
rownames(data) <- data$id |
352 | 46x |
data$text_hash <- structure(final_res$text_hash, names = data[final_res$id, "text"])[data$text] |
353 | 46x |
final_res <- cbind( |
354 | 46x |
data[, c(if (return_text) "text", if (res$provided_id) "id", "text_hash"), drop = FALSE], |
355 | 46x |
final_res[ |
356 | 46x |
structure(final_res$id, names = final_res$text_hash)[data$text_hash], |
357 | 46x |
!colnames(final_res) %in% c("id", "bin", "text_hash", "custom"), |
358 | 46x |
drop = FALSE |
359 |
] |
|
360 |
) |
|
361 | 46x |
row.names(final_res) <- NULL |
362 | 46x |
if (!is.null(output)) { |
363 | ! |
if (!grepl("\\.csv", output, TRUE)) output <- paste0(output, ".csv") |
364 | 1x |
if (compress && !grepl(".xz", output, fixed = TRUE)) output <- paste0(output, ".xz") |
365 | 1x |
if (grepl(".xz", output, fixed = TRUE)) compress <- TRUE |
366 | 1x |
if (verbose) message("writing results to file: ", output, " (", round(proc.time()[[3]] - st, 4), ")") |
367 | 3x |
dir.create(dirname(output), FALSE, TRUE) |
368 | 2x |
if (overwrite) unlink(output) |
369 | 1x |
if (compress) output <- xzfile(output) |
370 | 3x |
arrow::write_csv_arrow(final_res, file = output) |
371 |
} |
|
372 | ||
373 | 46x |
if (is.character(frameworks) && frameworks[1] != "all") { |
374 | ! |
if (verbose) message("selecting frameworks (", round(proc.time()[[3]] - st, 4), ")") |
375 | 7x |
vars <- colnames(final_res) |
376 | 7x |
sel <- grepl(paste0("^(?:", paste(tolower(frameworks), collapse = "|"), ")"), vars) |
377 | 7x |
if (any(sel)) { |
378 | 4x |
if (missing(framework_prefix) && (length(frameworks) == 1 && frameworks != "all")) framework_prefix <- FALSE |
379 | 6x |
sel <- unique(c("text", "id", "text_hash", names(api_args), vars[sel])) |
380 | 6x |
sel <- sel[sel %in% vars] |
381 | 6x |
final_res <- final_res[, sel] |
382 |
} else { |
|
383 | 1x |
warning("frameworks did not match any columns -- returning all", call. = FALSE) |
384 |
} |
|
385 |
} |
|
386 | 46x |
if (as_list) { |
387 | 1x |
if (missing(framework_prefix)) framework_prefix <- FALSE |
388 | 1x |
inall <- c("text", "id", "text_hash", names(api_args)) |
389 | 1x |
cols <- colnames(final_res) |
390 | 1x |
inall <- inall[inall %in% cols] |
391 | 1x |
pre <- sub("\\..*$", "", cols) |
392 | 1x |
pre <- unique(pre[!pre %in% inall]) |
393 | 1x |
final_res <- lapply(structure(pre, names = pre), function(f) { |
394 | 9x |
res <- final_res[, c(inall, grep(paste0("^", f), cols, value = TRUE))] |
395 | 9x |
if (!framework_prefix) colnames(res) <- sub("^.+\\.", "", colnames(res)) |
396 | 9x |
res |
397 |
}) |
|
398 | 5x |
} else if (!framework_prefix) colnames(final_res) <- sub("^.+\\.", "", colnames(final_res)) |
399 | 3x |
if (verbose) message("done (", round(proc.time()[[3]] - st, 4), ")") |
400 | 46x |
invisible(final_res) |
401 |
} |
1 |
#' View or Establish Custom Norming Contexts |
|
2 |
#' |
|
3 |
#' Custom norming contexts can be used to process later texts by specifying the |
|
4 |
#' \code{custom_context} API argument in the \code{receptiviti} function (e.g., |
|
5 |
#' \code{receptiviti("text to score", version = "v2", |
|
6 |
#' options = list(custom_context = "norm_name"))}, |
|
7 |
#' where \code{norm_name} is the name you set here). |
|
8 |
#' |
|
9 |
#' @param name Name of a new norming context, to be established from the provided \code{text}. |
|
10 |
#' Not providing a name will list the previously created contexts. |
|
11 |
#' @param text Text to be processed and used as the custom norming context. |
|
12 |
#' Not providing text will return the status of the named norming context. |
|
13 |
#' @param options Options to set for the norming context (e.g., |
|
14 |
#' \code{list(word_count_filter = 350,} \code{punctuation_filter = .25)}). |
|
15 |
#' @param delete Logical; If \code{TRUE}, will request to remove the \code{name} context. |
|
16 |
#' @param name_only Logical; If \code{TRUE}, will return a character vector of names |
|
17 |
#' only, including those of build-in contexts. |
|
18 |
#' @param id,text_column,id_column,files,dir,file_type,collapse_lines,encoding Additional |
|
19 |
#' arguments used to handle \code{text}; same as those in \code{\link{receptiviti}}. |
|
20 |
#' @param bundle_size,bundle_byte_limit,retry_limit,clear_scratch_cache,cores,use_future,in_memory |
|
21 |
#' Additional arguments used to manage the requests; same as those in |
|
22 |
#' \code{\link{receptiviti}}. |
|
23 |
#' @param key,secret,url Request arguments; same as those in \code{\link{receptiviti}}. |
|
24 |
#' @param verbose Logical; if \code{TRUE}, will show status messages. |
|
25 |
#' @returns Nothing if \code{delete} if \code{TRUE}. |
|
26 |
#' Otherwise, if \code{name} is not specified, a character vector containing names of each |
|
27 |
#' available norming context (built-in and custom). |
|
28 |
#' If \code{text} is not specified, the status of the |
|
29 |
#' named context in a \code{list}. If \code{text}s are provided, a \code{list}: |
|
30 |
#' \itemize{ |
|
31 |
#' \item \code{initial_status}: Initial status of the context. |
|
32 |
#' \item \code{first_pass}: Response after texts are sent the first time, or |
|
33 |
#' \code{NULL} if the initial status is \code{pass_two}. |
|
34 |
#' \item \code{second_pass}: Response after texts are sent the second time. |
|
35 |
#' } |
|
36 |
#' @examples |
|
37 |
#' \dontrun{ |
|
38 |
#' |
|
39 |
#' # get status of all existing custom norming contexts |
|
40 |
#' contexts <- receptiviti_norming(name_only = TRUE) |
|
41 |
#' |
|
42 |
#' # create or get the status of a single custom norming context |
|
43 |
#' status <- receptiviti_norming("new_context") |
|
44 |
#' |
|
45 |
#' # send texts to establish the context |
|
46 |
#' |
|
47 |
#' ## these texts can be specified just like |
|
48 |
#' ## texts in the main receptiviti function |
|
49 |
#' |
|
50 |
#' ## such as directly |
|
51 |
#' full_status <- receptiviti_norming("new_context", c( |
|
52 |
#' "a text to set the norm", |
|
53 |
#' "another text part of the new context" |
|
54 |
#' )) |
|
55 |
#' |
|
56 |
#' ## or from a file |
|
57 |
#' full_status <- receptiviti_norming( |
|
58 |
#' "new_context", "./path/to/text.csv", |
|
59 |
#' text_column = "texts" |
|
60 |
#' ) |
|
61 |
#' |
|
62 |
#' ## or from multiple files in a directory |
|
63 |
#' full_status <- receptiviti_norming( |
|
64 |
#' "new_context", |
|
65 |
#' dir = "./path/to/txt_files" |
|
66 |
#' ) |
|
67 |
#' } |
|
68 |
#' @export |
|
69 | ||
70 |
receptiviti_norming <- function(name = NULL, text = NULL, options = list(), delete = FALSE, name_only = FALSE, |
|
71 |
id = NULL, text_column = NULL, id_column = NULL, files = NULL, dir = NULL, |
|
72 |
file_type = "txt", collapse_lines = FALSE, encoding = NULL, |
|
73 |
bundle_size = 1000, bundle_byte_limit = 75e5, retry_limit = 50, |
|
74 |
clear_scratch_cache = TRUE, cores = detectCores() - 1, use_future = FALSE, in_memory = TRUE, |
|
75 |
url = Sys.getenv("RECEPTIVITI_URL"), key = Sys.getenv("RECEPTIVITI_KEY"), |
|
76 |
secret = Sys.getenv("RECEPTIVITI_SECRET"), verbose = TRUE) { |
|
77 | 12x |
params <- handle_request_params(url, key, secret) |
78 | 12x |
if (name_only) { |
79 | 5x |
req <- curl::curl_fetch_memory(paste0(params$url, "/v2/norming"), params$handler) |
80 | 5x |
if (req$status_code != 200) { |
81 | ! |
stop( |
82 | ! |
"failed to make norming list request: ", req$status_code, |
83 | ! |
call. = FALSE |
84 |
) |
|
85 |
} |
|
86 | 5x |
norms <- jsonlite::fromJSON(rawToChar(req$content)) |
87 | 5x |
if (verbose) { |
88 | 1x |
if (length(norms)) { |
89 | 1x |
message( |
90 | 1x |
"available norming context(s): ", |
91 | 1x |
paste(sub("custom/", "", norms, fixed = TRUE), collapse = ", ") |
92 |
) |
|
93 |
} else { |
|
94 | ! |
message("no custom norming contexts found") |
95 |
} |
|
96 |
} |
|
97 | 5x |
return(norms) |
98 |
} |
|
99 | ||
100 | 7x |
baseurl <- paste0(params$url, "/v2/norming/custom/") |
101 | 7x |
if (!is.null(name) && grepl("[^a-z0-9_.-]", name)) { |
102 | 1x |
stop( |
103 | 1x |
"`name` can only include lowercase letters, numbers, hyphens, underscores, or periods", |
104 | 1x |
call. = FALSE |
105 |
) |
|
106 |
} |
|
107 | ||
108 |
# list current contexts |
|
109 | 6x |
req <- curl::curl_fetch_memory(baseurl, params$handler) |
110 | 6x |
if (req$status_code != 200) { |
111 | ! |
stop( |
112 | ! |
"failed to make norming list request: ", req$status_code, |
113 | ! |
call. = FALSE |
114 |
) |
|
115 |
} |
|
116 | 6x |
norms <- jsonlite::fromJSON(rawToChar(req$content)) |
117 | 6x |
if (length(norms)) { |
118 | 6x |
if (verbose && is.null(name)) { |
119 | 1x |
message( |
120 | 1x |
"custom norming context(s) found: ", |
121 | 1x |
paste(sub("custom/", "", norms$name, fixed = TRUE), collapse = ", ") |
122 |
) |
|
123 |
} |
|
124 |
} else { |
|
125 | ! |
if (verbose && is.null(name)) message("no custom norming contexts found") |
126 | ! |
norms <- NULL |
127 |
} |
|
128 | 6x |
if (is.null(name)) { |
129 | 1x |
return(norms) |
130 |
} |
|
131 | ||
132 | 5x |
context_id <- paste0("custom/", name) |
133 | 5x |
if (context_id %in% norms$name) { |
134 | 4x |
if (delete) { |
135 | 1x |
curl::handle_setopt(params$handler, customrequest = "DELETE") |
136 | 1x |
req <- curl::curl_fetch_memory(paste0(baseurl, name), params$handler) |
137 | 1x |
if (req$status_code != 200) { |
138 | ! |
message <- list(error = rawToChar(req$content)) |
139 | ! |
if (substr(message$error, 1, 1) == "{") message$error <- jsonlite::fromJSON(message$error) |
140 | ! |
stop("failed to delete custom norming context: ", message$error, call. = FALSE) |
141 |
} |
|
142 | 1x |
return(invisible(NULL)) |
143 |
} |
|
144 | 3x |
status <- as.list(norms[norms$name == context_id, ]) |
145 | 3x |
if (length(options)) { |
146 | ! |
warning( |
147 | ! |
"context ", name, " already exists, so options do not apply", |
148 | ! |
call. = FALSE |
149 |
) |
|
150 |
} |
|
151 | 1x |
} else if (!delete) { |
152 |
# establish a new context if needed |
|
153 | 1x |
if (verbose) message("requesting creation of custom context ", name) |
154 | 1x |
curl::handle_setopt( |
155 | 1x |
params$handler, |
156 | 1x |
copypostfields = jsonlite::toJSON(c(name = name, options), auto_unbox = TRUE) |
157 |
) |
|
158 | 1x |
req <- curl::curl_fetch_memory(baseurl, params$handler) |
159 | 1x |
if (req$status_code != 200) { |
160 | ! |
message <- list(error = rawToChar(req$content)) |
161 | ! |
if (substr(message$error, 1, 1) == "{") message$error <- jsonlite::fromJSON(message$error) |
162 | ! |
stop("failed to make norming creation request: ", message$error, call. = FALSE) |
163 |
} |
|
164 | 1x |
status <- jsonlite::fromJSON(rawToChar(req$content)) |
165 | 1x |
for (option in names(options)) { |
166 | 2x |
if (!(option %in% names(status))) { |
167 | 1x |
warning("option ", option, " was not set", call. = FALSE) |
168 | 1x |
} else if (status[[option]] != options[[option]]) { |
169 | ! |
warning("set option ", option, " does not match the requested value", call. = FALSE) |
170 |
} |
|
171 |
} |
|
172 |
} |
|
173 | 4x |
if (delete) { |
174 | ! |
message("context ", name, " does not exist") |
175 | ! |
return(invisible(NULL)) |
176 |
} |
|
177 | 4x |
if (verbose) { |
178 | 4x |
message( |
179 | 4x |
"status of ", name, ": ", jsonlite::toJSON(status, pretty = TRUE, auto_unbox = TRUE) |
180 |
) |
|
181 |
} |
|
182 | 4x |
if (is.null(text)) { |
183 | 3x |
return(status) |
184 |
} |
|
185 | 1x |
if (status$status == "completed") { |
186 | ! |
warning("status is `completed`, so cannot send text", call. = FALSE) |
187 | ! |
return(invisible(list( |
188 | ! |
initial_status = status, |
189 | ! |
first_pass = NULL, |
190 | ! |
second_pass = NULL |
191 |
))) |
|
192 |
} |
|
193 | 1x |
if (status$status == "pass_two") { |
194 | ! |
first_pass <- NULL |
195 |
} else { |
|
196 | 1x |
if (verbose) message("sending first-pass samples for ", name) |
197 | 1x |
first_pass <- manage_request( |
198 | 1x |
text, |
199 | 1x |
id = id, text_column = text_column, id_column = id_column, files = files, dir = dir, |
200 | 1x |
file_type = file_type, collapse_lines = collapse_lines, encoding = encoding, |
201 | 1x |
bundle_size = bundle_size, bundle_byte_limit = bundle_byte_limit, retry_limit = retry_limit, |
202 | 1x |
clear_scratch_cache = clear_scratch_cache, cores = cores, use_future = use_future, |
203 | 1x |
in_memory = in_memory, url = paste0(baseurl, name, "/one"), key = key, secret = secret, |
204 | 1x |
verbose = verbose, to_norming = TRUE |
205 | 1x |
)$final_res |
206 |
} |
|
207 | 1x |
second_pass <- NULL |
208 | 1x |
if (!is.null(first_pass$analyzed) && all(first_pass$analyzed == 0)) { |
209 | ! |
warning( |
210 | ! |
"no texts were successfully analyzed in the first pass, so second pass was skipped", |
211 | ! |
call. = FALSE |
212 |
) |
|
213 |
} else { |
|
214 | 1x |
if (verbose) message("sending second-pass samples for ", name) |
215 | 1x |
second_pass <- manage_request( |
216 | 1x |
text, |
217 | 1x |
id = id, text_column = text_column, id_column = id_column, files = files, dir = dir, |
218 | 1x |
file_type = file_type, collapse_lines = collapse_lines, encoding = encoding, |
219 | 1x |
bundle_size = bundle_size, bundle_byte_limit = bundle_byte_limit, retry_limit = retry_limit, |
220 | 1x |
clear_scratch_cache = clear_scratch_cache, cores = cores, use_future = use_future, |
221 | 1x |
in_memory = in_memory, url = paste0(baseurl, name, "/two"), key = key, secret = secret, |
222 | 1x |
verbose = verbose, to_norming = TRUE |
223 | 1x |
)$final_res |
224 |
} |
|
225 | 1x |
if (!is.null(second_pass$analyzed) && all(second_pass$analyzed == 0)) { |
226 | ! |
warning("no texts were successfully analyzed in the second pass", call. = FALSE) |
227 |
} |
|
228 | 1x |
invisible(list( |
229 | 1x |
initial_status = status, |
230 | 1x |
first_pass = first_pass, |
231 | 1x |
second_pass = second_pass |
232 |
)) |
|
233 |
} |
1 |
#' @rdname receptiviti |
|
2 |
#' @export |
|
3 | ||
4 |
receptiviti_status <- function(url = Sys.getenv("RECEPTIVITI_URL"), key = Sys.getenv("RECEPTIVITI_KEY"), |
|
5 |
secret = Sys.getenv("RECEPTIVITI_SECRET"), verbose = TRUE, include_headers = FALSE) { |
|
6 | 62x |
params <- handle_request_params(url, key, secret) |
7 | 57x |
ping <- tryCatch( |
8 | 57x |
curl_fetch_memory(paste0(params$url, "/v1/ping"), params$handler), |
9 | 57x |
error = function(e) NULL |
10 |
) |
|
11 | 57x |
if (is.null(ping)) { |
12 | 1x |
if (verbose) message("Status: ERROR\nMessage: URL is unreachable") |
13 | 2x |
invisible(return()) |
14 |
} |
|
15 | 55x |
ping$content <- list(message = rawToChar(ping$content)) |
16 | 54x |
if (substr(ping$content, 1, 1) == "{") ping$content <- fromJSON(ping$content$message) |
17 | 55x |
ok <- ping$status_code == 200 && !length(ping$content$code) |
18 | 55x |
ping$status_message <- if (ok) { |
19 | 51x |
ping$content$pong |
20 |
} else { |
|
21 | 4x |
paste0( |
22 | 4x |
if (length(ping$content$code)) paste0(ping$status_code, " (", ping$content$code, "): "), |
23 | 4x |
if (nchar(ping$content$message) > 500 || grepl("<", ping$content$message, fixed = TRUE)) { |
24 | 1x |
ping$status_code |
25 |
} else { |
|
26 | 3x |
ping$content$message |
27 |
} |
|
28 |
) |
|
29 |
} |
|
30 | 55x |
if (verbose) { |
31 | 4x |
message("Status: ", if (ok) "OK" else "ERROR", "\nMessage: ", ping$status_message) |
32 | 4x |
if (include_headers) { |
33 | 1x |
ping$headers <- strsplit(rawToChar(ping$headers), "[\r\n]+", perl = TRUE)[[1]] |
34 | 1x |
json <- regexec("\\{.+\\}", ping$headers) |
35 | 1x |
for (i in seq_along(json)) { |
36 | 11x |
if (json[[i]] != -1) { |
37 | 2x |
regmatches(ping$headers[[i]], json[[i]]) <- paste(" ", strsplit(toJSON( |
38 | 2x |
fromJSON(regmatches(ping$headers[[i]], json[[i]])), |
39 | 2x |
auto_unbox = TRUE, pretty = TRUE |
40 | 2x |
), "\n")[[1]], collapse = "\n") |
41 |
} |
|
42 |
} |
|
43 | 1x |
message(paste0("\n", paste(" ", ping$headers, collapse = "\n"))) |
44 |
} |
|
45 |
} |
|
46 | 55x |
invisible(ping) |
47 |
} |
|
48 | ||
49 |
handle_request_params <- function(url, key, secret) { |
|
50 | 77x |
if (key == "") { |
51 | 2x |
stop( |
52 | 2x |
"specify your key, or set it to the RECEPTIVITI_KEY environment variable", |
53 | 2x |
call. = FALSE |
54 |
) |
|
55 |
} |
|
56 | 75x |
if (secret == "") { |
57 | 2x |
stop( |
58 | 2x |
"specify your secret, or set it to the RECEPTIVITI_SECRET environment variable", |
59 | 2x |
call. = FALSE |
60 |
) |
|
61 |
} |
|
62 | 73x |
url <- paste0( |
63 | 73x |
if (!grepl("http", tolower(url), fixed = TRUE)) "https://", |
64 | 73x |
sub("/+[Vv]\\d+(?:/.*)?$|/+$", "", url) |
65 |
) |
|
66 | 73x |
if (!grepl("^https?://[^.]+[.:][^.]", url, TRUE)) { |
67 | 2x |
stop( |
68 | 2x |
"url does not appear to be valid: ", url, |
69 | 2x |
call. = FALSE |
70 |
) |
|
71 |
} |
|
72 | 71x |
list(url = url, handler = new_handle(httpauth = 1, userpwd = paste0(key, ":", secret))) |
73 |
} |
1 |
#' List Available Frameworks |
|
2 |
#' |
|
3 |
#' Retrieve the list of frameworks available to your account. |
|
4 |
#' @param url,key,secret Request arguments; same as those in \code{\link{receptiviti}}. |
|
5 |
#' @returns A character vector containing the names of frameworks available to your account. |
|
6 |
#' @examples |
|
7 |
#' \dontrun{ |
|
8 |
#' |
|
9 |
#' # see which frameworks are available to your account |
|
10 |
#' frameworks <- receptiviti_frameworks() |
|
11 |
#' } |
|
12 |
#' @export |
|
13 | ||
14 |
receptiviti_frameworks <- function( |
|
15 |
url = Sys.getenv("RECEPTIVITI_URL"), key = Sys.getenv("RECEPTIVITI_KEY"), |
|
16 |
secret = Sys.getenv("RECEPTIVITI_SECRET")) { |
|
17 | 3x |
params <- handle_request_params(url, key, secret) |
18 | 2x |
req <- curl::curl_fetch_memory(paste0(params$url, "/v2/frameworks"), params$handler) |
19 | 2x |
if (req$status_code == 200) { |
20 | 2x |
return(jsonlite::fromJSON(rawToChar(req$content))) |
21 |
} |
|
22 | ! |
content <- list(message = rawToChar(req$content)) |
23 | ! |
if (substr(content$message, 1, 1) == "{") content <- jsonlite::fromJSON(content$message) |
24 | ! |
stop("failed to retrieve frameworks list: ", content$message, call. = FALSE) |
25 |
} |