diff --git a/DESCRIPTION b/DESCRIPTION index 9fd6077..1cae2d4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,7 +41,8 @@ BugReports: https://github.com/r-transit/gtfsio/issues Imports: data.table, utils, - zip + zip, + jsonlite Suggests: knitr, rmarkdown, @@ -50,12 +51,13 @@ VignetteBuilder: knitr Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Collate: 'gtfsio_error.R' 'assert_gtfs.R' 'assert_inputs.R' 'checks.R' + 'data.R' 'export_gtfs.R' 'get_gtfs_standards.R' 'gtfs_methods.R' @@ -63,3 +65,6 @@ Collate: 'gtfsio.R' 'import_gtfs.R' 'new_gtfs.R' +LazyData: true +Depends: + R (>= 3.1.0) diff --git a/NAMESPACE b/NAMESPACE index 2020898..5d88933 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,3 +16,4 @@ export(import_gtfs) export(new_gtfs) importFrom(data.table,"%chin%") importFrom(data.table,":=") +importFrom(jsonlite,read_json) diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..60d60cf --- /dev/null +++ b/R/data.R @@ -0,0 +1,52 @@ +#' GTFS reference +#' +#' The data from the official GTFS specification document parsed to a list. Revision date: +#' ``r attributes(gtfs_reference)$revision_date``. +#' +#' @format +#' A list with data for every GTFS file. Each named list element (also a list) has +#' specifications for one GTFS file in the following structure: +#' \itemize{ +#' \item{`File_Name`: file name including file extension (txt or geojson)} +#' \item{`File_Presence`: Presence condition applied to the file} +#' \item{`file`: file name without file extension} +#' \item{`file_ext`: file extension} +#' \item{`fields`: data.frame with parsed field specification (columns: +#' `Field_Name`, `Type`, `Presence`, `Description`, `gtfsio_type`)} +#' \item{`primary_key`: primary key as vector} +#' \item{`field_types`: named vector on how GTFS types (values) should be read in gtfsio +#' (names). Values are the same as in `fields`.} +#' } +#' +#' @details +#' GTFS Types are converted to R types in gtfsio according to the following list: +#' `r .doc_field_types()` +#' +#' @source [https://github.com/google/transit/blob/master/gtfs/spec/en/reference.md](https://github.com/google/transit/blob/master/gtfs/spec/en/reference.md) +#' @keywords data +"gtfs_reference" + +.doc_field_types = function() { # nocov start + fields <- lapply(gtfsio::gtfs_reference, `[[`, "fields") + fields <- do.call("rbind", fields) + + type_assignment <- unique(fields[,c("Type", "gtfsio_type")]) + type_assignment <- type_assignment[!startsWith(type_assignment$Type, "Foreign ID"),] + type_assignment <- type_assignment[order(type_assignment$gtfsio_type),] + + type_assignment <- lapply(split(type_assignment, type_assignment$Type), function(ta) { + if(nrow(ta) > 1) { + ta$gtfsio_type <- paste0(ta$gtfsio_type, collapse = ", ") + ta <- ta[1,] + } + ta + }) + type_assignment <- do.call("rbind", type_assignment) + + doc <- c("\\itemize{", + paste0("\\item{", type_assignment$Type, " = \`", + type_assignment$gtfsio_type, "\`}"), + "}\n") + + return(paste(doc, collapse = "\n")) +} # nocov end diff --git a/R/export_gtfs.R b/R/export_gtfs.R index c92f937..c366fa8 100644 --- a/R/export_gtfs.R +++ b/R/export_gtfs.R @@ -2,7 +2,7 @@ #' #' Writes GTFS objects to disk as GTFS transit feeds. The object must be #' formatted according to the standards for reading and writing GTFS transit -#' feeds, as specified in \code{\link{get_gtfs_standards}} (i.e. data types are +#' feeds, as specified in \code{\link{gtfs_reference}} (i.e. data types are #' not checked). If present, does not write auxiliary tables held in a sub-list #' named \code{"."}. #' @@ -25,7 +25,7 @@ #' #' @return Invisibly returns the same GTFS object passed to \code{gtfs}. #' -#' @seealso \code{\link{get_gtfs_standards}} +#' @seealso \code{\link{gtfs_reference}} #' #' @family io functions #' @@ -52,8 +52,6 @@ export_gtfs <- function(gtfs, overwrite = TRUE, quiet = TRUE) { - gtfs_standards <- get_gtfs_standards() - # basic input checking assert_class(gtfs, "gtfs") @@ -73,7 +71,7 @@ export_gtfs <- function(gtfs, if (!as_dir & !grepl("\\.zip$", path)) error_ext_must_be_zip() if (as_dir & grepl("\\.zip$", path)) error_path_must_be_dir() - extra_files <- setdiff(files, names(gtfs_standards)) + extra_files <- setdiff(files, names(gtfsio::gtfs_reference)) if (standard_only & !is.null(files) & !identical(extra_files, character(0))) { error_non_standard_files(extra_files) } @@ -91,7 +89,7 @@ export_gtfs <- function(gtfs, # 'extra_files' is re-evaluated because 'files' might have changed in the # lines above - extra_files <- setdiff(files, names(gtfs_standards)) + extra_files <- setdiff(files, names(gtfsio::gtfs_reference)) if (standard_only) files <- setdiff(files, extra_files) @@ -119,35 +117,43 @@ export_gtfs <- function(gtfs, if (!quiet) message("Writing text files to ", tmpd) - for (file in files) { + filenames <- append_file_ext(files) + filepaths <- file.path(tmpd, filenames) + + for (i in seq_along(files)) { - filename <- paste0(file, ".txt") - filepath <- file.path(tmpd, filename) + filename <- filenames[i] + file <- files[i] + filepath <- filepaths[i] if (!quiet) message(" - Writing ", filename) dt <- gtfs[[file]] - # if 'standard_only' is set to TRUE, remove non-standard fields from 'dt' - # before writing it to disk + if(endsWith(filename, ".geojson")) { + jsonlite::write_json(dt, filepath, pretty = FALSE, auto_unbox = TRUE, digits = 8) + } else { - if (standard_only) { + # if 'standard_only' is set to TRUE, remove non-standard fields from 'dt' + # before writing it to disk - file_cols <- names(dt) - extra_cols <- setdiff(file_cols, names(gtfs_standards[[file]])) + if (standard_only) { - if (!identical(extra_cols, character(0))) dt <- dt[, !..extra_cols] + file_cols <- names(dt) + extra_cols <- setdiff(file_cols, names(gtfsio::gtfs_reference[[file]][["field_types"]])) - } + if (!identical(extra_cols, character(0))) dt <- dt[, !..extra_cols] - # print warning message if warning is raised and 'quiet' is FALSE - withCallingHandlers( - data.table::fwrite(dt, filepath, scipen = 999), - warning = function(cnd) { - if (!quiet) message(" - ", conditionMessage(cnd)) } - ) + # print warning message if warning is raised and 'quiet' is FALSE + withCallingHandlers( + data.table::fwrite(dt, filepath, scipen = 999), + warning = function(cnd) { + if (!quiet) message(" - ", conditionMessage(cnd)) + } + ) + } } # zip the contents of 'tmpd' to 'path', if as_dir = FALSE @@ -161,8 +167,6 @@ export_gtfs <- function(gtfs, unlink(path, recursive = TRUE) - filepaths <- file.path(tmpd, paste0(files, ".txt")) - zip::zip( path, filepaths, diff --git a/R/get_gtfs_standards.R b/R/get_gtfs_standards.R index 1484625..0af4bb2 100644 --- a/R/get_gtfs_standards.R +++ b/R/get_gtfs_standards.R @@ -1,6 +1,10 @@ +# nocov start + #' Generate GTFS standards #' #' @description +#' *This function is deprecated and no longer used in [import_gtfs()] or [export_gtfs()].* +#' #' Generates a list specifying the standards to be used when reading and writing #' GTFS feeds with R. Each list element (also a list) represents a distinct GTFS #' table, and describes: @@ -40,11 +44,12 @@ #' - Timezone = `character` #' - URL = `character` #' -#' @examples -#' gtfs_standards <- get_gtfs_standards() -#' +#' @examples \dontrun{ +#' gtfs_standards <- get_gtfs_standards() +#' } #' @export get_gtfs_standards <- function() { + .Deprecated("gtfs_reference") agency <- list( file_spec = "req", agency_id = list("id", "cond"), @@ -404,3 +409,5 @@ translate_types <- function(text_file, r_equivalents) { } ) } + +# nocov end diff --git a/R/import_gtfs.R b/R/import_gtfs.R index 36cdff0..2f1c369 100644 --- a/R/import_gtfs.R +++ b/R/import_gtfs.R @@ -2,7 +2,7 @@ #' #' Imports GTFS transit feeds from either a local \code{.zip} file or an URL. #' Columns are parsed according to the standards for reading and writing GTFS -#' feeds specified in \code{\link{get_gtfs_standards}}. +#' feeds specified in \code{\link{gtfs_reference}}. #' #' @param path A string. The path to a GTFS \code{.zip} file. #' @param files A character vector. The text files to be read from the GTFS, @@ -21,8 +21,7 @@ #' if an undocumented field is not specified in \code{extra_spec}, it is read #' as character (i.e. you may specify in \code{extra_spec} only the fields #' that you want to read as a different type). Only supports the -#' \code{character}, \code{integer} and \code{numeric} types, also used in -#' \code{\link{get_gtfs_standards}}. +#' \code{character}, \code{integer} and \code{numeric} types. #' @param skip A character vector. Text files that should not be read from the #' GTFS, without the \code{.txt} extension. If \code{NULL} (the default), no #' files are skipped. Cannot be used if \code{files} is set. @@ -36,7 +35,7 @@ #' @return A GTFS object: a named list of data frames, each one corresponding to #' a distinct text file from the given GTFS feed. #' -#' @seealso \code{\link{get_gtfs_standards}} +#' @seealso \code{\link{gtfs_reference}} #' #' @family io functions #' @@ -110,48 +109,48 @@ import_gtfs <- function(path, # check which files are inside the GTFS. if any non text file is found, raise # a warning and do not try to read it as a csv. remove the '.txt' extension # from the text files to reference them without it in messages and errors + # filenames: file with extension (.txt/.geojson), file: withouth extension - files_in_gtfs <- tryCatch( + filenames_in_gtfs <- tryCatch( zip::zip_list(path)$filename, error = function(cnd) cnd ) - if (inherits(files_in_gtfs, "error")) error_path_must_be_zip() + if (inherits(filenames_in_gtfs, "error")) error_path_must_be_zip() - non_text_files <- files_in_gtfs[!grepl("\\.txt$", files_in_gtfs)] + non_standard_file_ext <- filenames_in_gtfs[!(grepl("\\.txt$", filenames_in_gtfs) | grepl("\\.geojson$", filenames_in_gtfs))] - if (!identical(non_text_files, character(0))) { + if (!identical(non_standard_file_ext, character(0))) { warning( - "Found non .txt files when attempting to read the GTFS feed: ", - paste(non_text_files, collapse = ", "), "\n", + "Found non .txt or .geojson files when attempting to read the GTFS feed: ", + paste(non_standard_file_ext, collapse = ", "), "\n", "These files have been ignored and were not imported to the GTFS object.", call. = FALSE ) } - files_in_gtfs <- setdiff(files_in_gtfs, non_text_files) - files_in_gtfs <- gsub("\\.txt", "", files_in_gtfs) + filenames_in_gtfs <- setdiff(filenames_in_gtfs, non_standard_file_ext) # read only the text files specified either in 'files' or in 'skip'. # if both are NULL, read all text files - if (!is.null(files)) { - files_to_read <- files + filenames_to_read <- append_file_ext(files) } else if (!is.null(skip)) { - files_to_read <- setdiff(files_in_gtfs, skip) + filenames_to_read <- setdiff(filenames_in_gtfs, append_file_ext(skip)) } else { - files_to_read <- files_in_gtfs + filenames_to_read <- filenames_in_gtfs } # check if all specified files exist and raise an error if any does not - missing_files <- files_to_read[! files_to_read %chin% files_in_gtfs] + missing_files <- filenames_to_read[! filenames_to_read %chin% filenames_in_gtfs] + if (!identical(missing_files, character(0))) { error_gtfs_missing_files(missing_files) } # raise an error if a file is specified in 'fields' but does not appear in - # 'files_to_read' + # 'filenames_to_read' - files_misspec <- names(fields)[! names(fields) %chin% files_to_read] + files_misspec <- names(fields)[! names(fields) %chin% remove_file_ext(filenames_to_read)] if (!is.null(files_misspec) & !identical(files_misspec, character(0))) { error_files_misspecified(files_misspec) @@ -165,7 +164,7 @@ import_gtfs <- function(path, zip::unzip( path, - files = paste0(files_to_read, ".txt"), + files = filenames_to_read, exdir = tmpdir, overwrite = TRUE ) @@ -173,19 +172,14 @@ import_gtfs <- function(path, if (!quiet) message( "Unzipped the following files to ", tmpdir, ":\n", - paste0(" * ", files_to_read, ".txt", collapse = "\n") + paste0(" * ", filenames_to_read, collapse = "\n") ) - # get GTFS standards to assign correct classes to each field - - gtfs_standards <- get_gtfs_standards() - # read files into list gtfs <- lapply( - X = files_to_read, + X = filenames_to_read, FUN = read_files, - gtfs_standards, fields, extra_spec, tmpdir, @@ -197,13 +191,13 @@ import_gtfs <- function(path, # need to be stripped here file_names <- vapply( - files_to_read, + filenames_to_read, function(i) utils::tail(strsplit(i, .Platform$file.sep)[[1]], 1), character(1), USE.NAMES = FALSE ) - names(gtfs) <- file_names + names(gtfs) <- remove_file_ext(file_names) # create gtfs object from 'gtfs' @@ -219,10 +213,8 @@ import_gtfs <- function(path, #' #' Reads a GTFS text file from the main \code{.zip} file. #' -#' @param file A string. The name of the file (without \code{.txt} extension) to +#' @param file A string. The name of the file (with \code{.txt} or \code{.geojson} extension) to #' be read. -#' @param gtfs_standards A named list. Created by -#' \code{\link{get_gtfs_standards}}. #' @param fields A named list. Passed by the user to \code{\link{import_gtfs}}. #' @param extra_spec A named list. Passed by the user to #' \code{\link{import_gtfs}}. @@ -238,11 +230,10 @@ import_gtfs <- function(path, #' @return A \code{data.table} representing the desired text file according to #' the standards for reading and writing GTFS feeds with R. #' -#' @seealso \code{\link{get_gtfs_standards}} +#' @seealso \code{\link{gtfs_reference}} #' #' @keywords internal read_files <- function(file, - gtfs_standards, fields, extra_spec, tmpdir, @@ -251,13 +242,23 @@ read_files <- function(file, # create object to hold the file with '.txt' extension - file_txt <- paste0(file, ".txt") + filename <- file + file_type <- "txt" # TODO get file ext as function + if(grepl("\\.geojson$", file)) { + file_type <- "geojson" + } + file <- remove_file_ext(file) - if (!quiet) message("Reading ", file_txt) + if (!quiet) message("Reading ", file) + + # read geojson and return + if (file_type == "geojson") { + return(read_geojson(file.path(tmpdir, filename))) + } # get standards for reading and fields to be read from the given 'file' - file_standards <- gtfs_standards[[file]] + ref_fields <- gtfsio::gtfs_reference[[file]][["field_types"]] fields <- fields[[file]] extra_spec <- extra_spec[[file]] @@ -265,9 +266,9 @@ read_files <- function(file, # documented or extra files are read. throw an error if it refers to # documented fields - spec_both <- names(extra_spec)[names(extra_spec) %chin% names(file_standards)] + spec_both <- names(extra_spec)[names(extra_spec) %chin% names(ref_fields)] - if (any(names(extra_spec) %chin% names(file_standards))) { + if (any(names(extra_spec) %chin% names(ref_fields))) { error_field_is_documented(file, spec_both) } @@ -275,14 +276,14 @@ read_files <- function(file, # - if 'file_standards' is NULL then file is undocumented # - print warning message if warning is raised and 'quiet' is FALSE - if (is.null(file_standards) & !quiet) { + if (is.null(ref_fields) & !quiet) { message(" - File undocumented. Trying to read it as a csv.") } withCallingHandlers( { sample_dt <- data.table::fread( - file.path(tmpdir, file_txt), + file.path(tmpdir, filename), nrows = 1, colClasses = "character" ) @@ -326,11 +327,11 @@ read_files <- function(file, # get the standard data types of documented fields from 'file_standards' - doc_fields <- fields_to_read[fields_to_read %chin% names(file_standards)] + doc_fields <- fields_to_read[fields_to_read %chin% names(ref_fields)] doc_classes <- vapply( doc_fields, - function(field) file_standards[[field]][[1]], + function(field) ref_fields[[field]][[1]], character(1) ) @@ -356,7 +357,7 @@ read_files <- function(file, withCallingHandlers( { full_dt <- data.table::fread( - file.path(tmpdir, file_txt), + file.path(tmpdir, filename), select = fields_classes, encoding = encoding ) @@ -368,6 +369,35 @@ read_files <- function(file, } +#' Read geojson file +#' +#' @param file.geojson geojson file +#' +#' @keywords internal +#' @importFrom jsonlite read_json +read_geojson <- function(file.geojson) { + read_json(file.geojson) +} + +remove_file_ext = function(file) { + tools::file_path_sans_ext(file) +} + +append_file_ext = function(file) { + vapply(file, function(.f) { + file_ext <- gtfsio::gtfs_reference[[remove_file_ext(.f)]][["file_ext"]] + if (is.null(file_ext)) { + # use default for argument-specified non-standard files, + # behaviour defined in test_import_gtfs.R#292 + file_ext <- "txt" + } + if(endsWith(.f, paste0(".", file_ext))) { + return(.f) # file extension already present + } else { + return(paste0(.f, ".", file_ext)) + } + }, ".txt", USE.NAMES = FALSE) +} # errors ------------------------------------------------------------------ diff --git a/data/gtfs_reference.rda b/data/gtfs_reference.rda new file mode 100644 index 0000000..3cfab2d Binary files /dev/null and b/data/gtfs_reference.rda differ diff --git a/inst/extdata/locations_feed.zip b/inst/extdata/locations_feed.zip new file mode 100644 index 0000000..60ff150 Binary files /dev/null and b/inst/extdata/locations_feed.zip differ diff --git a/inst/reference/create_gtfs_reference_data.R b/inst/reference/create_gtfs_reference_data.R new file mode 100644 index 0000000..8cfc550 --- /dev/null +++ b/inst/reference/create_gtfs_reference_data.R @@ -0,0 +1,104 @@ +#' Parse the official GTFS reference markdown file and create a table +#' that assigns every GTFS field with an R-equivalent gtfsio datatype + +library(dplyr) +source("parse_markdown.R") +reference.md = curl::curl_download("https://raw.githubusercontent.com/google/transit/master/gtfs/spec/en/reference.md", tempfile()) + +# Parse current reference markdown to list of tables and bind it #### +reference_fields = parse_fields(reference.md) +f <- bind_fields_reference_list(reference_fields) + +# Link gtfs types to R types #### +f$gtfsio_type <- NA + +# Enum +f$gtfsio_type[f$Type == "Enum"] <- "integer" +# Correct non-integer enums (manual fix) +f[f$File_Name == "translations.txt" & f$Field_Name == "table_name","gtfsio_type"] <- "character" + +# ID: character +f$gtfsio_type[startsWith(f$Type, "Foreign ID")] <- "character" +f$gtfsio_type[startsWith(f$Type, "ID referencing")] <- "character" +f$gtfsio_type[f$Type %in% c("ID", "Foreign ID", "Unique ID")] <- "character" + +# Text/Strings +f$gtfsio_type[f$Type %in% c("Text", "String")] <- "character" +f$gtfsio_type[f$Type %in% c("URL", "Language code", "Currency code", "Email", + "Phone number", "Timezone", "Color", + "Text or URL or Email or Phone number")] <- "character" + +# Date and Time +f$gtfsio_type[f$Type == "Date"] <- "integer" +f$gtfsio_type[f$Type == "Time"] <- "character" + +# Numerics +f$gtfsio_type[f$Type %in% c("Latitude", "Longitude", "Non-negative float", + "Positive float", "Float", "Currency amount")] <- "numeric" +f$gtfsio_type[f$Type %in% c("Non-negative integer", "Non-zero integer", + "Positive integer", "Non-null integer", "Integer")] <- "integer" + +# Geojson +f$gtfsio_type[f$Type == "Array"] <- "geojson_array" +f$gtfsio_type[f$Type == "Object"] <- "geojson_object" + +f$Field_Name[f$File_Name == "locations.geojson"] <- gsub(" ", "", f$Field_Name[f$File_Name == "locations.geojson"]) +f$Field_Name[f$File_Name == "locations.geojson"] <- gsub("\\\\", "", f$Field_Name[f$File_Name == "locations.geojson"]) +f$Field_Name[f$File_Name == "locations.geojson"] <- gsub("-", "", f$Field_Name[f$File_Name == "locations.geojson"]) + +if(any(is.na(f$gtfsio_type))) { + stop("GTFS types without R equivalent found:\n", paste0(unique(f$Type[is.na(f$gtfsio_type)]), collapse = ", ")) +} + +# Rename columns, add file column without file extension #### +f <- f |> + mutate(file = gsub("\\.txt$", "", gsub("\\.geojson$", "", File_Name))) |> + as.data.frame() + +# Parse reference file data #### +gtfs_reference_files = cleanup_files_reference(parse_files(reference.md)) +gtfs_reference_files <- gtfs_reference_files |> + tidyr::separate(File_Name, c("file", "file_ext"), sep = "\\.", remove = F) |> + select(File_Name, File_Presence, file, file_ext) |> + as.data.frame() + +# Check file presence #### +file_presence1 = lapply(reference_fields, \(file) { + trimws(gsub("\\*", "", attributes(file)$presence)) +}) +file_presence2 = as.list(gtfs_reference_files$File_Presence) +names(file_presence2) <- gtfs_reference_files$File_Name +stopifnot(identical(file_presence2, file_presence1)) +rm(file_presence1); rm(file_presence2) + +# Extract primary keys #### +primary_keys = lapply(reference_fields, \(file) { + pk = attributes(file)$primary_key + if(is.null(pk)) return(NULL) + pk <- gsub("`", "", pk) + pk <- gsub('\\"', "", pk) + pk <- stringr::str_split_1(pk, ",") + trimws(pk) +}) + +names(primary_keys) <- remove_file_ext(names(primary_keys)) + +# Create gtfs_reference data object #### +gtfs_reference = gtfs_reference_files |> + split(gtfs_reference_files$file) |> + lapply(as.list) + +for(file in names(gtfs_reference)) { + fields = f[f$file == file,] + fields <- select(fields, -file, -File_Name) + gtfs_reference[[file]]$fields <- fields + gtfs_reference[[file]][["primary_key"]] <- primary_keys[[file]] + + field_types = fields$gtfsio_type + names(field_types) <- fields$Field_Name + gtfs_reference[[file]][["field_types"]] <- field_types +} + +attributes(gtfs_reference)$revision_date <- attributes(reference_fields)$revision_date + +usethis::use_data(gtfs_reference, internal = F, overwrite = T) diff --git a/inst/reference/parse_markdown.R b/inst/reference/parse_markdown.R new file mode 100644 index 0000000..463a17e --- /dev/null +++ b/inst/reference/parse_markdown.R @@ -0,0 +1,105 @@ +# https://stackoverflow.com/questions/48087762/markdown-table-to-data-frame-in-r +read_markdown_table = function(lines) { + lines <- lines[!grepl('^[[:blank:]+-=:_|]*$', lines)] + lines <- gsub('(^\\s*?\\|)|(\\|\\s*?$)', '', lines) + readr::read_delim(paste(lines, collapse = '\n'), delim = '|', + trim_ws = TRUE, show_col_types = FALSE) +} + +# parse all field definitions and return a list of tables +parse_fields = function(reference.md) { + field_reference_list = list() + + ref_lines = readr::read_lines(reference.md) + ref_lines[length(ref_lines)+1] <- "" # ensure empty last row + + table_index = stringr::str_starts(ref_lines, "\\| ") # lines with table markdown + + i = which(ref_lines == "## Field Definitions") + while(i <= length(ref_lines)) { + .line = ref_lines[i] + if(stringr::str_starts(.line, "### ")) { + .current_file <- stringr::str_replace_all(.line, "### ", "") + } + if(stringr::str_starts(.line, "File: ")) { + .file_presence <- stringr::str_replace_all(.line, "File: ", "") + } + if(stringr::str_starts(.line, "Primary key ")) { + .primary_key <- stringr::str_replace_all(.line, "Primary key \\(", "") + .primary_key <- stringr::str_replace_all(.primary_key, "\\)", "") + } + + # parse fields table + if(stringr::str_starts(.line, "\\|[ ]+Field Name \\| Type")) { + j = min(which(!table_index & seq_along(ref_lines) > i))-1 + ref_table = read_markdown_table(ref_lines[i:j]) + + stopifnot(!is.null(.current_file), !is.null(.file_presence)) + if(is.null(.primary_key)) stopifnot(stringr::str_ends(.current_file, "geojson")) + + # print problems if available + if(nrow(readr::problems(ref_table)) > 0) { + cat(.current_file, "\n") + print(readr::problems(ref_table)[,1:4]) + } + + # cleanup attributes + attributes(ref_table)$presence <- .file_presence + attributes(ref_table)$primary_key <- .primary_key + attributes(ref_table)$spec <- NULL # remove col_type info + attributes(ref_table)$problems <- NULL # remove col_type info + + # assign to return list + field_reference_list[[.current_file]] <- ref_table + + # clear values + .current_file <- .file_presence <- .primary_key <- NULL + } + i <- i+1 + } + + # Revision Date + revision_date = gsub("**Revised ", "", ref_lines[3], fixed = T) + revision_date <- readr::parse_date(strsplit(revision_date, "\\. See")[[1]][1], "%b %d, %Y") + attributes(field_reference_list)$revision_date <- revision_date + + return(field_reference_list) +} + +bind_fields_reference_list = function(field_reference_list) { + field_reference_list |> + bind_rows(.id = "File_Name") |> + rename(Field_Name = `Field Name`) |> + mutate(Field_Name = gsub("`", "", Field_Name)) |> + mutate(Presence = gsub("**", "", Presence, fixed = TRUE)) +} + +parse_files = function(reference.md) { + ref_lines = readr::read_lines(reference.md) + + i = which(ref_lines == "## Dataset Files") + j = which(ref_lines == "## File Requirements") + stopifnot(i < j) + ref_lines <- ref_lines[i:j] + + index = stringr::str_starts(ref_lines, "\\| ") + stopifnot(sum(diff(index)) == 0) + + files_table = read_markdown_table(ref_lines[index]) + + files_table$`File Name` <- files_table$`File Name` |> + strsplit(, split = "](", fixed = T) |> + lapply(\(x) { + gsub("[", "", x[1], fixed = T) + }) |> unlist() + + return(files_table) +} + +# cleanup +cleanup_files_reference = function(files_table) { + files_table |> + rename(File_Name = `File Name`) |> + mutate(Presence = gsub("**", "", Presence, fixed = TRUE)) |> + select(File_Name, File_Presence = Presence) +} diff --git a/inst/tinytest/test_export_gtfs.R b/inst/tinytest/test_export_gtfs.R index ad11850..44c55c3 100644 --- a/inst/tinytest/test_export_gtfs.R +++ b/inst/tinytest/test_export_gtfs.R @@ -2,7 +2,6 @@ path <- system.file("extdata/ggl_gtfs.zip", package = "gtfsio") gtfs <- import_gtfs(path) tmpf <- tempfile(fileext = ".zip") tmpd <- tempfile() -gtfs_standards <- get_gtfs_standards() tester <- function(gtfs_obj = gtfs, path = tmpf, @@ -186,25 +185,25 @@ expect_true( ) ) -for (file in list.files(tmpd)) { +for (filenames in list.files(tmpd)) { # all existing fields should be standard - no_txt_file <- sub(".txt", "", file) - std_fields <- setdiff(names(gtfs_standards[[no_txt_file]]), "file_spec") - existing_fields <- readLines(file.path(tmpd, file), n = 1L) + file <- gtfsio:::remove_file_ext(filenames) + std_fields <- names(gtfsio::gtfs_reference[[file]][["field_types"]]) + existing_fields <- readLines(file.path(tmpd, filenames), n = 1L) existing_fields <- strsplit(existing_fields, ",")[[1]] - expect_true(all(existing_fields %in% std_fields), info = no_txt_file) + expect_true(all(existing_fields %in% std_fields), info = file) # all standard fields in the object should be written - std_fields_in_obj <- names(gtfs[[no_txt_file]]) + std_fields_in_obj <- names(gtfs[[file]]) std_fields_in_obj <- std_fields_in_obj[std_fields_in_obj %in% std_fields] expect_true( all(std_fields_in_obj %in% existing_fields), - info = no_txt_file + info = file ) } @@ -294,3 +293,14 @@ resulting_shapes_content <- readLines(file.path(target_dir, "shapes.txt")) expect_false(identical(resulting_shapes_content[3], "b,2,41,41,1e+07")) expect_identical(resulting_shapes_content[3], "b,2,41,41,10000000") + +# issue #36 --------------------------------------------------------------- +# re-reading written json files are the same + +locations_feed <- import_gtfs(system.file("extdata/locations_feed.zip", package = "gtfsio")) +tmpfile <- tempfile(fileext = ".zip") +export_gtfs(locations_feed, tmpfile) + +reimported <- import_gtfs(tmpfile) + +expect_equal(reimported, locations_feed) diff --git a/inst/tinytest/test_import_gtfs.R b/inst/tinytest/test_import_gtfs.R index 728538f..a83cd21 100644 --- a/inst/tinytest/test_import_gtfs.R +++ b/inst/tinytest/test_import_gtfs.R @@ -55,7 +55,7 @@ expect_error( tester(files = "ola"), pattern = paste0( "The provided GTFS feed doesn't contain the following ", - "text file\\(s\\): 'ola'" + "text file\\(s\\): 'ola.txt'" ), class = "gtfs_missing_files" ) @@ -144,17 +144,7 @@ expect_identical(gtfs_fields, list(shapes = "shape_id", trips = "trip_id")) # get the standard type in R used to read each field -gtfs_standards <- get_gtfs_standards() - -standard_types <- lapply( - gtfs_standards, - function(file) { - fields <- setdiff(names(file), "file_spec") - types <- vapply(fields, function(f) file[[f]][[1]], character(1)) - types <- types[order(names(types))] - } -) -standard_types <- standard_types[order(names(standard_types))] +standard_types <- lapply(gtfs_reference, `[[`, "field_types") # get the type actually used to read each field @@ -431,3 +421,21 @@ expect_error(tester(not_gtfs_file), class = "path_must_be_zip") not_gtfs_url <- "https://www.google.com" expect_error(tester(not_gtfs_url), class = "path_must_be_zip") + +# issue #36 --------------------------------------------------------------- +# locations.geojson files should be read without warning + +expect_silent( + import_gtfs(system.file("extdata/locations_feed.zip", package = "gtfsio")) +) + +locations_feed <- import_gtfs(system.file("extdata/locations_feed.zip", package = "gtfsio")) + +expect_inherits(locations_feed[["locations"]], "list") +expect_equal(names(locations_feed[["locations"]]), c("type", "name", "crs", "features")) + +# file extension handling +file_exts <- gtfsio:::append_file_ext(c(names(gtfs_reference), "dummy")) +expect_equal(file_exts[which(names(gtfs_reference) == "locations")], "locations.geojson") +expect_equal(file_exts[length(file_exts)], "dummy.txt") +expect_equal(gtfsio:::append_file_ext(file_exts), file_exts) diff --git a/man/export_gtfs.Rd b/man/export_gtfs.Rd index 47b1b5c..705c7f8 100644 --- a/man/export_gtfs.Rd +++ b/man/export_gtfs.Rd @@ -46,7 +46,7 @@ Invisibly returns the same GTFS object passed to \code{gtfs}. \description{ Writes GTFS objects to disk as GTFS transit feeds. The object must be formatted according to the standards for reading and writing GTFS transit -feeds, as specified in \code{\link{get_gtfs_standards}} (i.e. data types are +feeds, as specified in \code{\link{gtfs_reference}} (i.e. data types are not checked). If present, does not write auxiliary tables held in a sub-list named \code{"."}. } @@ -65,7 +65,7 @@ zip::zip_list(tmpf)$filename } \seealso{ -\code{\link{get_gtfs_standards}} +\code{\link{gtfs_reference}} Other io functions: \code{\link{import_gtfs}()} diff --git a/man/get_gtfs_standards.Rd b/man/get_gtfs_standards.Rd index 7dcd6c1..abf3a96 100644 --- a/man/get_gtfs_standards.Rd +++ b/man/get_gtfs_standards.Rd @@ -11,6 +11,8 @@ A named list, in which each element represents the R equivalent of each GTFS table standard. } \description{ +\emph{This function is deprecated and no longer used in \code{\link[=import_gtfs]{import_gtfs()}} or \code{\link[=export_gtfs]{export_gtfs()}}.} + Generates a list specifying the standards to be used when reading and writing GTFS feeds with R. Each list element (also a list) represents a distinct GTFS table, and describes: @@ -51,6 +53,7 @@ represent each GTFS data type are described below: } \examples{ -gtfs_standards <- get_gtfs_standards() - +\dontrun{ + gtfs_standards <- get_gtfs_standards() +} } diff --git a/man/gtfs_reference.Rd b/man/gtfs_reference.Rd new file mode 100644 index 0000000..569d901 --- /dev/null +++ b/man/gtfs_reference.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{gtfs_reference} +\alias{gtfs_reference} +\title{GTFS reference} +\format{ +A list with data for every GTFS file. Each named list element (also a list) has +specifications for one GTFS file in the following structure: +\itemize{ +\item{\code{File_Name}: file name including file extension (txt or geojson)} +\item{\code{File_Presence}: Presence condition applied to the file} +\item{\code{file}: file name without file extension} +\item{\code{file_ext}: file extension} +\item{\code{fields}: data.frame with parsed field specification (columns: +\code{Field_Name}, \code{Type}, \code{Presence}, \code{Description}, \code{gtfsio_type})} +\item{\code{primary_key}: primary key as vector} +\item{\code{field_types}: named vector on how GTFS types (values) should be read in gtfsio +(names). Values are the same as in \code{fields}.} +} +} +\source{ +\url{https://github.com/google/transit/blob/master/gtfs/spec/en/reference.md} +} +\usage{ +gtfs_reference +} +\description{ +The data from the official GTFS specification document parsed to a list. Revision date: +\code{2024-08-16}. +} +\details{ +GTFS Types are converted to R types in gtfsio according to the following list: +\itemize{ +\item{Array = \code{geojson_array}} +\item{Color = \code{character}} +\item{Currency amount = \code{numeric}} +\item{Currency code = \code{character}} +\item{Date = \code{integer}} +\item{Email = \code{character}} +\item{Enum = \verb{character, integer}} +\item{Float = \code{numeric}} +\item{ID = \code{character}} +\item{Integer = \code{integer}} +\item{Language code = \code{character}} +\item{Latitude = \code{numeric}} +\item{Longitude = \code{numeric}} +\item{Non-negative float = \code{numeric}} +\item{Non-negative integer = \code{integer}} +\item{Non-null integer = \code{integer}} +\item{Non-zero integer = \code{integer}} +\item{Object = \code{geojson_object}} +\item{Phone number = \code{character}} +\item{Positive float = \code{numeric}} +\item{Positive integer = \code{integer}} +\item{String = \code{character}} +\item{Text = \code{character}} +\item{Text or URL or Email or Phone number = \code{character}} +\item{Time = \code{character}} +\item{Timezone = \code{character}} +\item{URL = \code{character}} +\item{Unique ID = \code{character}} +} +} +\keyword{data} diff --git a/man/import_gtfs.Rd b/man/import_gtfs.Rd index 49f88b8..758f2eb 100644 --- a/man/import_gtfs.Rd +++ b/man/import_gtfs.Rd @@ -35,8 +35,7 @@ undocumented fields, in the format if an undocumented field is not specified in \code{extra_spec}, it is read as character (i.e. you may specify in \code{extra_spec} only the fields that you want to read as a different type). Only supports the -\code{character}, \code{integer} and \code{numeric} types, also used in -\code{\link{get_gtfs_standards}}.} +\code{character}, \code{integer} and \code{numeric} types.} \item{skip}{A character vector. Text files that should not be read from the GTFS, without the \code{.txt} extension. If \code{NULL} (the default), no @@ -57,7 +56,7 @@ a distinct text file from the given GTFS feed. \description{ Imports GTFS transit feeds from either a local \code{.zip} file or an URL. Columns are parsed according to the standards for reading and writing GTFS -feeds specified in \code{\link{get_gtfs_standards}}. +feeds specified in \code{\link{gtfs_reference}}. } \examples{ gtfs_path <- system.file("extdata/ggl_gtfs.zip", package = "gtfsio") @@ -84,7 +83,7 @@ gtfs <- import_gtfs( } \seealso{ -\code{\link{get_gtfs_standards}} +\code{\link{gtfs_reference}} Other io functions: \code{\link{export_gtfs}()} diff --git a/man/read_files.Rd b/man/read_files.Rd index 8179cb9..10fddd8 100644 --- a/man/read_files.Rd +++ b/man/read_files.Rd @@ -4,15 +4,12 @@ \alias{read_files} \title{Read a GTFS text file} \usage{ -read_files(file, gtfs_standards, fields, extra_spec, tmpdir, quiet, encoding) +read_files(file, fields, extra_spec, tmpdir, quiet, encoding) } \arguments{ -\item{file}{A string. The name of the file (without \code{.txt} extension) to +\item{file}{A string. The name of the file (with \code{.txt} or \code{.geojson} extension) to be read.} -\item{gtfs_standards}{A named list. Created by -\code{\link{get_gtfs_standards}}.} - \item{fields}{A named list. Passed by the user to \code{\link{import_gtfs}}.} \item{extra_spec}{A named list. Passed by the user to @@ -37,6 +34,6 @@ the standards for reading and writing GTFS feeds with R. Reads a GTFS text file from the main \code{.zip} file. } \seealso{ -\code{\link{get_gtfs_standards}} +\code{\link{gtfs_reference}} } \keyword{internal} diff --git a/man/read_geojson.Rd b/man/read_geojson.Rd new file mode 100644 index 0000000..87a5425 --- /dev/null +++ b/man/read_geojson.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/import_gtfs.R +\name{read_geojson} +\alias{read_geojson} +\title{Read geojson file} +\usage{ +read_geojson(file.geojson) +} +\arguments{ +\item{file.geojson}{geojson file} +} +\description{ +Read geojson file +} +\keyword{internal}