diff --git a/R/process_scotgov_deaths.R b/R/process_scotgov_deaths.R index 351564b..e852eb1 100644 --- a/R/process_scotgov_deaths.R +++ b/R/process_scotgov_deaths.R @@ -31,7 +31,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_health_board/per_week-covid_related_deaths", + component = "nhs_health_board/week-covid_related_deaths", array = as.matrix(covid_deaths_per_week_by_nhsboard), dimension_names = list( `health board` = rownames(covid_deaths_per_week_by_nhsboard), @@ -53,7 +53,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "council_area/per_week-covid_related_deaths", + component = "council_area/week-covid_related_deaths", array = as.matrix(covid_deaths_per_week_by_councilarea), dimension_names = list( `council area` = rownames( @@ -98,7 +98,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_age/per_week/by_gender-country-covid_related_deaths", + component = "age_group/week/gender-country-covid_related_deaths", array = array(c(female, male), dim = c(dim(female), 2)), dimension_names = list( `age group` = rownames(covid_deaths_per_week_by_agegroup_f), @@ -114,7 +114,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_age/per_week-persons-country-covid_related_deaths", + component = "age_group/week-persons-country-covid_related_deaths", array = as.matrix(covid_deaths_per_week_by_agegroup_all), dimension_names = list( `age group` = rownames(covid_deaths_per_week_by_agegroup_all), @@ -134,7 +134,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_location/per_week-covid_related_deaths", + component = "location_type/week-covid_related_deaths", array = 
as.matrix(covid_deaths_per_week_by_location), dimension_names = list( `location` = rownames(covid_deaths_per_week_by_location), @@ -175,7 +175,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_health_board/per_week-all_deaths", + component = "nhs_health_board/week-all_deaths", array = as.matrix(all_deaths_per_week_by_nhsboard), dimension_names = list( `health board` = rownames(all_deaths_per_week_by_nhsboard), @@ -197,7 +197,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_council_area/per_week-all_deaths", + component = "council_area/week-all_deaths", array = as.matrix(all_deaths_per_week_by_councilarea), dimension_names = list( `council area` = rownames(all_deaths_per_week_by_councilarea), @@ -241,7 +241,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_age/per_week/by_gender-country-all_deaths", + component = "age_group/week/gender-country-all_deaths", array = array(c(female, male), dim = c(dim(female), 2)), dimension_names = list( `age group` = rownames(all_deaths_per_week_by_agegroup_f), @@ -258,7 +258,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_age/per_week-persons-country-all_deaths", + component = "age_group/week-persons-country-all_deaths", array = as.matrix(all_deaths_per_week_by_agegroup_all), dimension_names = list( `age group` = rownames(all_deaths_per_week_by_agegroup_all), @@ -279,7 +279,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_location/per_week-all_deaths", + component = "location_type/week-all_deaths", array = as.matrix(all_deaths_per_week_by_location), dimension_names = list( `location` = rownames(all_deaths_per_week_by_location), @@ 
-303,7 +303,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "per_week-persons-scotland-all_deaths-averaged_over_5years", + component = "week-persons-scotland-all_deaths-averaged_over_5years", array = as.matrix(all_deaths_per_week_averaged_over_5years), dimension_names = list( `total` = rownames( @@ -331,7 +331,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_health_board/by_location-covid_related_deaths", + component = "nhs_health_board/location_type-covid_related_deaths", array = as.matrix(covid_deaths_by_nhsboard_and_location), dimension_names = list( `health board` = rownames( @@ -350,7 +350,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_health_board/by_location-all_deaths", + component = "nhs_health_board/location_type-all_deaths", array = as.matrix(all_deaths_by_nhsboard_and_location), dimension_names = list( `health board` = rownames( @@ -368,7 +368,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_council_area/by_location-covid_related_deaths", + component = "council_area/location_type-covid_related_deaths", array = as.matrix(covid_deaths_by_councilarea_and_location), dimension_names = list( `council area` = rownames( @@ -386,7 +386,7 @@ process_scotgov_deaths <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "by_council_area/by_location-all_deaths", + component = "council_area/location_type-all_deaths", array = as.matrix(all_deaths_by_councilarea_and_location), dimension_names = list( `council area` = rownames( diff --git a/R/process_scotgov_management.R b/R/process_scotgov_management.R index 32894e9..e03f13f 100644 --- a/R/process_scotgov_management.R +++ b/R/process_scotgov_management.R @@ 
-6,7 +6,7 @@ process_scotgov_management <- function(sourcefile, filename) { scotMan <- read.csv(file = sourcefile) %>% dplyr::mutate(featurecode = gsub( - "", "", featurecode)) %>% dplyr::mutate(count = dplyr::case_when(count == "*" ~ "0", @@ -22,7 +22,7 @@ process_scotgov_management <- function(sourcefile, filename) { tibble::column_to_rownames("variable") SCRCdataAPI::create_array(filename = filename, - component = "scotland/calls", + component = "call_centre/date-number_of_calls", array = as.matrix(calls.dat), dimension_names = list( helpline = rownames(calls.dat), @@ -49,7 +49,7 @@ process_scotgov_management <- function(sourcefile, filename) { tibble::column_to_rownames("variable") SCRCdataAPI::create_array(filename = filename, - component = "scotland/hospital", + component = "confirmed_suspected_total/date-country-hospital", array = as.matrix(patients.in.hospital.dat), dimension_names = list( status = rownames(patients.in.hospital.dat), @@ -61,7 +61,7 @@ process_scotgov_management <- function(sourcefile, filename) { tibble::column_to_rownames("variable") SCRCdataAPI::create_array(filename = filename, - component = "scotland/icu", + component = "confirmed_suspected_total/date-country-icu", array = as.matrix(patients.in.icu.dat), dimension_names = list( status = rownames(patients.in.icu.dat), @@ -77,7 +77,7 @@ process_scotgov_management <- function(sourcefile, filename) { tibble::column_to_rownames("variable") SCRCdataAPI::create_array(filename = filename, - component = "special_health_board/hospital", + component = "confirmed_suspected/date-country-hospital-special_health_board", array = as.matrix(special.patients.in.hosp.dat), dimension_names = list( status = rownames(special.patients.in.hosp.dat), @@ -88,12 +88,13 @@ process_scotgov_management <- function(sourcefile, filename) { reshape2::dcast(variable ~ date, value.var = "count") %>% tibble::column_to_rownames("variable") - SCRCdataAPI::create_array(filename = filename, - component = 
"special_health_board/icu", - array = as.matrix(special.patients.in.icu.dat), - dimension_names = list( - status = rownames(special.patients.in.icu.dat), - date = colnames(special.patients.in.icu.dat))) + SCRCdataAPI::create_array( + filename = filename, + component = "date-country-icu-special_health_board-total", + array = as.matrix(special.patients.in.icu.dat), + dimension_names = list( + status = rownames(special.patients.in.icu.dat), + date = colnames(special.patients.in.icu.dat))) # NHS health board hosp.nhs.dat <- hospital.dat %>% @@ -106,7 +107,7 @@ process_scotgov_management <- function(sourcefile, filename) { tibble::column_to_rownames("featurecode") SCRCdataAPI::create_array(filename = filename, - component = "nhs_health_board/hospital/total", + component = "nhs_health_board/date-icu-total", array = as.matrix(hosp.nhs.total.dat), dimension_names = list( `health board` = rownames(hosp.nhs.total.dat), @@ -118,12 +119,13 @@ process_scotgov_management <- function(sourcefile, filename) { reshape2::dcast(featurecode ~ date, value.var = "count") %>% tibble::column_to_rownames("featurecode") - SCRCdataAPI::create_array(filename = filename, - component = "nhs_health_board/hospital/suspected", - array = as.matrix(hosp.nhs.suspected.dat), - dimension_names = list( - `health board` = rownames(hosp.nhs.suspected.dat), - date = colnames(hosp.nhs.suspected.dat))) + SCRCdataAPI::create_array( + filename = filename, + component = "nhs_health_board/date-hospital-suspected", + array = as.matrix(hosp.nhs.suspected.dat), + dimension_names = list( + `health board` = rownames(hosp.nhs.suspected.dat), + date = colnames(hosp.nhs.suspected.dat))) hosp.nhs.confirmed.dat <- hosp.nhs.dat %>% dplyr::filter(grepl("Confirmed", variable)) %>% @@ -131,12 +133,13 @@ process_scotgov_management <- function(sourcefile, filename) { reshape2::dcast(featurecode ~ date, value.var = "count") %>% tibble::column_to_rownames("featurecode") - SCRCdataAPI::create_array(filename = filename, - component 
= "nhs_health_board/hospital/confirmed", - array = as.matrix(hosp.nhs.confirmed.dat), - dimension_names = list( - `health board` = rownames(hosp.nhs.confirmed.dat), - date = colnames(hosp.nhs.confirmed.dat))) + SCRCdataAPI::create_array( + filename = filename, + component = "nhs_health_board/date-hospital-confirmed", + array = as.matrix(hosp.nhs.confirmed.dat), + dimension_names = list( + `health board` = rownames(hosp.nhs.confirmed.dat), + date = colnames(hosp.nhs.confirmed.dat))) # 3 ----------------------------------------------------------------------- # Numbers of ambulance attendances (total and COVID-19 suspected) and number of @@ -148,7 +151,7 @@ process_scotgov_management <- function(sourcefile, filename) { tibble::column_to_rownames("variable") SCRCdataAPI::create_array(filename = filename, - component = "scotland/ambulance_attendances", + component = "ambulance_attendances/date", array = as.matrix(ambulance.dat), dimension_names = list( status = rownames(ambulance.dat), @@ -163,7 +166,7 @@ process_scotgov_management <- function(sourcefile, filename) { dplyr::select(-"1") SCRCdataAPI::create_array(filename = filename, - component = "scotland/delayed_discharges", + component = "date-delayed_discharges", array = as.matrix(discharges.dat), dimension_names = list( delayed = rownames(discharges.dat), @@ -191,7 +194,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/testing/daily/people_found_positive", + component = "date-country-tested_positive", array = as.matrix(testing.daily.positive), dimension_names = list( delayed = rownames(testing.daily.positive), @@ -205,7 +208,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/testing/cumulative/tests_carried_out", + component = "testing_location/date-cumulative", array = as.matrix(testing.cumulative), dimension_names = list( delayed = 
rownames(testing.cumulative), @@ -219,7 +222,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/testing/daily/tests_carried_out", + component = "testing_location/date", array = as.matrix(testing.daily), dimension_names = list( delayed = rownames(testing.daily), @@ -233,7 +236,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/testing/cumulative/people_tested", + component = "test_result/date-cumulative", array = as.matrix(testing.country.cumulative), dimension_names = list( delayed = rownames(testing.country.cumulative), @@ -250,12 +253,13 @@ process_scotgov_management <- function(sourcefile, filename) { reshape2::dcast(featurecode ~ date, value.var = "count") %>% tibble::column_to_rownames("featurecode") - SCRCdataAPI::create_array(filename = filename, - component = "nhs_health_board/testing", - array = as.matrix(testing.area.dat), - dimension_names = list( - delayed = rownames(testing.area.dat), - date = colnames(testing.area.dat))) + SCRCdataAPI::create_array( + filename = filename, + component = "nhs_health_board/date-testing-cumulative", + array = as.matrix(testing.area.dat), + dimension_names = list( + delayed = rownames(testing.area.dat), + date = colnames(testing.area.dat))) # 6 ----------------------------------------------------------------------- # Numbers of NHS workforce reporting as absent due to a range of reasons @@ -269,7 +273,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/nhs_workforce_covid19_absences", + component = "nhs_workforce/date-country-covid_related_absences", array = as.matrix(nhs.dat), dimension_names = list( delayed = rownames(nhs.dat), @@ -293,7 +297,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - 
component = "scotland/carehomes/proportion_of_cases", + component = "date-country-carehomes-proportion_that_have_reported_a_suspected_case", array = as.matrix(carehomes.proportion.dat), dimension_names = list( delayed = rownames(carehomes.proportion.dat), @@ -307,7 +311,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/carehomes/staff_absence_rate", + component = "date-country-carehomes-staff_absence_rate", array = as.matrix(carehomes.absence.rate.dat), dimension_names = list( delayed = rownames(carehomes.absence.rate.dat), @@ -321,7 +325,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/carehomes/response_rate", + component = "date-country-carehomes-response_rate", array = as.matrix(carehomes.response.rate.dat), dimension_names = list( delayed = rownames(carehomes.response.rate.dat), @@ -339,7 +343,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/carehomes/cumulative_number_reports", + component = "suspected_vs_reported/date-country-carehomes-cumulative", array = as.matrix(carehomes.count.cum.dat), dimension_names = list( delayed = rownames(carehomes.count.cum.dat), @@ -353,7 +357,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/carehomes/cumulative_number_suspected", + component = "date-country-carehomes-cumulative_number_of_suspected_cases", array = as.matrix(carehomes.cum.numdat), dimension_names = list( delayed = rownames(carehomes.cum.numdat), @@ -367,7 +371,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/carehomes/new_suspected_cases", + component = "date-country-carehomes-new_suspected_cases", array = 
as.matrix(carehomes.count.daily.dat), dimension_names = list( delayed = rownames(carehomes.count.daily.dat), @@ -381,7 +385,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/carehomes/staff_reported_absent", + component = "date-country-carehomes-staff_reported_absent", array = as.matrix(carehomes.count.staff.dat), dimension_names = list( delayed = rownames(carehomes.count.staff.dat), @@ -396,7 +400,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/carehomes/carehome_with_suspected_cases", + component = "number_vs_revised_number/date-country-carehomes-carehomes_with_suspected_cases", array = as.matrix(carehomes.count.sus.dat), dimension_names = list( delayed = rownames(carehomes.count.sus.dat), @@ -411,7 +415,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/carehomes/carehomes_submitted_return", + component = "date-country-carehomes-carehomes_submitted_return", array = as.matrix(carehomes.count.carehomes.return.dat), dimension_names = list( delayed = rownames(carehomes.count.carehomes.return.dat), @@ -425,7 +429,7 @@ process_scotgov_management <- function(sourcefile, filename) { SCRCdataAPI::create_array( filename = filename, - component = "scotland/carehomes/staff_submitted_return", + component = "date-country-carehomes-staff_submitted_return", array = as.matrix(carehomes.count.total.staff.dat), dimension_names = list( delayed = rownames(carehomes.count.total.staff.dat), @@ -441,7 +445,7 @@ process_scotgov_management <- function(sourcefile, filename) { dplyr::select(-"1") SCRCdataAPI::create_array(filename = filename, - component = "scotland/deaths", + component = "date-country-deaths_registered", array = as.matrix(deaths.dat), dimension_names = list( delayed = rownames(deaths.dat), diff --git 
a/inst/scripts/scotgov_deaths.R b/inst/scripts/scotgov_deaths.R index ee90237..63e4197 100644 --- a/inst/scripts/scotgov_deaths.R +++ b/inst/scripts/scotgov_deaths.R @@ -8,23 +8,20 @@ library(SCRCdataAPI) library(SCRCdata) - -# initialise parameters --------------------------------------------------- - key <- read.table("token.txt") -namespace <- "SCRC" -doi_or_unique_name <- "scottish scottish deaths-involving-coronavirus-covid-19" + +# initialise parameters --------------------------------------------------- product_name <- paste("human", "infection", "SARS-CoV-2", "scotland", "mortality", sep = "/") -todays_date <- as.POSIXct("2020-07-15 17:46:00", - format = "%Y-%m-%d %H:%M:%S") -version <- 0 +todays_date <- Sys.time() +version <- 0.0 +doi_or_unique_name <- "scottish scottish deaths-involving-coronavirus-covid-19" # where was the source data download from? (original source) -dataset_name <- "Scottish Government Open Data Repository" +source_name <- "Scottish Government Open Data Repository" original_root <- "https://statistics.gov.scot/sparql.csv?query=" original_path <- "PREFIX qb: PREFIX data: @@ -60,6 +57,7 @@ WHERE { repo_storageRoot <- "github" script_gitRepo <- "ScottishCovidResponse/SCRCdata" repo_version <- "0.1.0" +processing_script <- "scotgov_deaths.R" @@ -67,6 +65,8 @@ repo_version <- "0.1.0" # Additional parameters (automatically generated) ------------------------- +namespace <- "SCRC" + # when was the source data downloaded? source_downloadDate <- todays_date @@ -92,9 +92,8 @@ source_storageRoot <- "boydorr" source_path <- file.path(product_name, source_filename) # where is the submission script stored? -script_storageRoot <- "boydorr" -script_filename <- "exec.sh" -script_path <- file.path(product_name, script_filename) +script_storageRoot <- "text_file" +submission_text <- paste0("R -f inst/scripts/", processing_script) # where is the data product stored? 
product_storageRoot <- "boydorr" @@ -106,7 +105,7 @@ product_path <- file.path(product_name, product_filename) # original source name original_sourceId <- new_source( - name = dataset_name, + name = source_name, abbreviation = "Scottish Government Open Data Repository", website = "https://statistics.gov.scot/", key = key) @@ -124,8 +123,10 @@ source_storageRootId <- new_storage_root(name = source_storageRoot, # submission script storage root script_storageRootId <- new_storage_root(name = script_storageRoot, - root = "ftp://boydorr.gla.ac.uk/scrc/", + root = "https://data.scrc.uk/api/text_file/", key = key) +tmp <- gsub("^.*/([0-9]+)/$", "\\1", script_storageRootId) +script_path <- paste0(tmp, "/?format=text") # data product storage root product_storageRootId <- new_storage_root(name = product_storageRoot, @@ -170,7 +171,7 @@ sourceDataURIs <- upload_source_data( # generate data product --------------------------------------------------- -scriptURIs <- process_scotgov_deaths( +process_scotgov_deaths( sourcefile = file.path(local_path, source_filename), filename = file.path(local_path, product_filename)) @@ -189,12 +190,13 @@ dataProductURIs <- upload_data_product( -# upload processing script metadata to the registry ----------------------- +# upload submission script metadata to the registry ----------------------- submissionScriptURIs <- upload_submission_script( storage_root_id = script_storageRootId, path = script_path, - hash = get_github_hash(script_gitRepo), + hash = openssl::sha1(submission_text), + text = submission_text, run_date = script_processingDate, key = key) @@ -217,5 +219,3 @@ upload_object_links(run_date = script_processingDate, inputs = list(sourceDataURIs$source_objectComponentId), outputs = dataProductURIs$product_objectComponentId, key = key) - - diff --git a/inst/scripts/scotgov_management.R b/inst/scripts/scotgov_management.R index 405795b..da5a8c6 100644 --- a/inst/scripts/scotgov_management.R +++ b/inst/scripts/scotgov_management.R @@ 
-11,23 +11,21 @@ library(SCRCdataAPI) library(SCRCdata) - -# initialise parameters --------------------------------------------------- - key <- read.table("token.txt") -namespace <- "SCRC" -doi_or_unique_name <- "scottish coronavirus-covid-19-management-information" + +# initialise parameters --------------------------------------------------- product_name <- paste("human", "infection", "SARS-CoV-2", "scotland", "cases_and_management", sep = "/") -todays_date <- as.POSIXct("2020-07-15 17:46:00", - format = "%Y-%m-%d %H:%M:%S") -version <- 0 +todays_date <- Sys.time() +version <- 0.0 +doi_or_unique_name <- "scottish coronavirus-covid-19-management-information" + # where was the source data download from? (original source) -dataset_name <- "Scottish Government Open Data Repository" +source_name <- "Scottish Government Open Data Repository" original_root <- "https://statistics.gov.scot/sparql.csv?query=" original_path <- "PREFIX qb: PREFIX data: @@ -55,13 +53,15 @@ WHERE { repo_storageRoot <- "github" script_gitRepo <- "ScottishCovidResponse/SCRCdata" repo_version <- "0.1.0" - +processing_script <- "scotgov_management.R" # Additional parameters (automatically generated) ------------------------- +namespace <- "SCRC" + # when was the source data downloaded? source_downloadDate <- todays_date @@ -87,9 +87,8 @@ source_storageRoot <- "boydorr" source_path <- file.path(product_name, source_filename) # where is the submission script stored? -script_storageRoot <- "boydorr" -script_path <- file.path(product_name, source_filename) -script_filename <- "exec.sh" +script_storageRoot <- "text_file" +submission_text <- paste0("R -f inst/scripts/", processing_script) # where is the data product stored? 
product_storageRoot <- "boydorr" @@ -101,7 +100,7 @@ product_path <- file.path(product_name, product_filename) # original source name original_sourceId <- new_source( - name = dataset_name, + name = source_name, abbreviation = "Scottish Government Open Data Repository", website = "https://statistics.gov.scot/", key = key) @@ -119,8 +118,10 @@ source_storageRootId <- new_storage_root(name = source_storageRoot, # submission script storage root script_storageRootId <- new_storage_root(name = script_storageRoot, - root = "ftp://boydorr.gla.ac.uk/scrc/", + root = "https://data.scrc.uk/api/text_file/", key = key) +tmp <- gsub("^.*/([0-9]+)/$", "\\1", script_storageRootId) +script_path <- paste0(tmp, "/?format=text") # data product storage root product_storageRootId <- new_storage_root(name = product_storageRoot, @@ -165,7 +166,7 @@ sourceDataURIs <- upload_source_data( # generate data product --------------------------------------------------- -scriptURIs <- process_scotgov_management( +process_scotgov_management( sourcefile = file.path(local_path, source_filename), filename = file.path(local_path, product_filename)) @@ -184,13 +185,13 @@ dataProductURIs <- upload_data_product( -# upload processing script metadata to the registry ----------------------- +# upload submission script metadata to the registry ----------------------- submissionScriptURIs <- upload_submission_script( storage_root_id = script_storageRootId, - path = product_name, - filename = script_filename, - hash = get_github_hash(script_gitRepo), + path = script_path, + hash = openssl::sha1(submission_text), + text = submission_text, run_date = script_processingDate, key = key) diff --git a/inst/templates/upload_data_product.R b/inst/templates/upload_data_product.R new file mode 100644 index 0000000..a24313d --- /dev/null +++ b/inst/templates/upload_data_product.R @@ -0,0 +1,152 @@ +#' scottish deaths-involving-coronavirus-covid-19 +#' +#' This dataset presents the weekly, and year to date, provisional 
number of +#' deaths associated with coronavirus (COVID-19) alongside the total number +#' of deaths registered in Scotland, broken down by age and sex. (From: https://statistics.gov.scot/data/deaths-involving-coronavirus-covid-19) +#' + +library(SCRCdataAPI) +library(SCRCdata) + +key <- read.table("token.txt") + + +# initialise parameters --------------------------------------------------- + +product_name <- paste("human", "infection", "SARS-CoV-2", "scotland", + "mortality", sep = "/") + +todays_date <- as.POSIXct("2020-07-16 11:30:00", + format = "%Y-%m-%d %H:%M:%S") +version <- 0 +doi_or_unique_name <- "scottish scottish deaths-involving-coronavirus-covid-19" + +# where was the source data downloaded from? (original source) +source_name <- "Scottish Government Open Data Repository" +original_root <- "https://statistics.gov.scot/sparql.csv?query=" +original_path <- "PREFIX qb: +PREFIX data: +PREFIX rdfs: +PREFIX dim: +PREFIX sdim: +PREFIX stat: +PREFIX mp: +SELECT ?featurecode ?featurename ?areatypename ?date ?cause ?location ?gender ?age ?type ?count +WHERE { + ?indicator qb:dataSet data:deaths-involving-coronavirus-covid-19; + mp:count ?count; + qb:measureType ?measType; + sdim:age ?value; + sdim:causeofdeath ?causeDeath; + sdim:locationofdeath ?locDeath; + sdim:sex ?sex; + dim:refArea ?featurecode; + dim:refPeriod ?period. + + ?measType rdfs:label ?type. + ?value rdfs:label ?age. + ?causeDeath rdfs:label ?cause. + ?locDeath rdfs:label ?location. + ?sex rdfs:label ?gender. + ?featurecode stat:code ?areatype; + rdfs:label ?featurename. + ?areatype rdfs:label ?areatypename. + ?period rdfs:label ?date. +}" + +# where is the processing script stored? 
+repo_storageRoot <- "github" +script_gitRepo <- "ScottishCovidResponse/SCRCdata" +repo_version <- "0.1.0" + + + +# Additional parameters --------------------------------------------------- +# These parameters are automatically generated and assume the following: +# (1) you intend to download your source data now +# (2) your source data will be automatically downloaded to data-raw/[product_name] +# (3) your source data filename will be [version_number].csv +# (4) you intend to process this data and generate a data product now +# (5) your data product will be automatically saved to data-raw/[product_name] +# (6) your data product filename will be [version_number].h5 +# (7) + +namespace <- "SCRC" + +# when was the source data downloaded? +source_downloadDate <- todays_date + +# when was the data product generated? +script_processingDate <- todays_date + +# create version number (this is used to generate the *.csv and *.h5 filenames) +tmp <- as.Date(todays_date, format = "%Y-%m-%d") +version_number <- paste(gsub("-", "", tmp), version , sep = ".") + +# where is the source data downloaded to? (locally, before being stored) +local_path <- file.path("data-raw", product_name) +source_filename <- paste0(version_number, ".csv") + +# where is the data product saved? (locally, before being stored) +processed_path <- file.path("data-raw", product_name) +product_filename <- paste0(version_number, ".h5") + + + +# where is the source data stored? +source_storageRoot <- "boydorr" +source_path <- file.path(product_name, source_filename) + +# where is the submission script stored? +script_storageRoot <- "text_file" +submission_text <- "R -f inst/scripts/scotgov_deaths.R" + +# where is the data product stored? 
+product_storageRoot <- "boydorr" +product_path <- file.path(product_name, product_filename) + + + +# download source data ---------------------------------------------------- + +download_from_database(source_root = original_root, + source_path = original_path, + filename = source_filename, + path = local_path) + + + +# generate data product --------------------------------------------------- + +process_scotgov_deaths( + sourcefile = file.path(local_path, source_filename), + filename = file.path(local_path, product_filename)) + + + +# default data that should be in database --------------------------------- + +# data product storage root +product_storageRootId <- new_storage_root(name = product_storageRoot, + root = "ftp://boydorr.gla.ac.uk/scrc/", + key = key) + +# namespace +namespaceId <- new_namespace(name = namespace, + key = key) + + + + +# upload data product metadata to the registry ---------------------------- + +dataProductURIs <- upload_data_product( + storage_root_id = product_storageRootId, + name = product_name, + processed_path = file.path(processed_path, product_filename), + product_path = paste(product_path, product_filename, sep = "/"), + version = version_number, + namespace_id = namespaceId, + key = key) + + diff --git a/inst/templates/upload_dataset.R b/inst/templates/upload_dataset.R new file mode 100644 index 0000000..b5a988c --- /dev/null +++ b/inst/templates/upload_dataset.R @@ -0,0 +1,235 @@ +#' dataset-name +#' +#' Dataset description and link to source +#' + +library(SCRCdataAPI) +library(SCRCdata) + + +# Download a key from https://data.scrc.uk and store it somewhere safe! 
+key <- read.table("token.txt") + + +# The product_name is used to identify the data product and will be used to +# generate various file locations: +# (1) source data is downloaded locally to data-raw/[product_name] +# (2) source data is stored on the Boydorr server at +# ../../srv/ftp/scrc/[product_name] +# (3) data product is saved locally (after processing) to data-raw/[product_name] +# (4) data product is stored on the Boydorr server at +# ../../srv/ftp/scrc/[product_name] +product_name <- paste("human", "infection", "SARS-CoV-2", "scotland", + "cases_and_management", sep = "/") + +# The following information is used to generate the source data and data +# product filenames, e.g. 20200716.0.csv and 20200716.0.h5 +todays_date <- Sys.time() +version <- 0.0 + +# This is the name of your dataset +doi_or_unique_name <- "scottish coronavirus-covid-19-management-information" + +# Where was the source data downloaded from? (original source) +# The source_name is the name associated with the original_root +source_name <- "Scottish Government Open Data Repository" +original_root <- "https://statistics.gov.scot/sparql.csv?query=" +# Here, the original_path is a query (which is later converted into a path +# on line 164); if you have a url, you can use download_from_url() instead +original_path <- "PREFIX qb: +PREFIX data: +PREFIX rdfs: +PREFIX mp: +PREFIX dim: +PREFIX sdim: +PREFIX stat: +SELECT ?featurecode ?featurename ?date ?measure ?variable ?count +WHERE { + ?indicator qb:dataSet data:coronavirus-covid-19-management-information; + dim:refArea ?featurecode; + dim:refPeriod ?period; + sdim:variable ?varname; + qb:measureType ?type. +{?indicator mp:count ?count.} UNION {?indicator mp:ratio ?count.} + + ?featurecode ?featurename. + ?period rdfs:label ?date. + ?varname rdfs:label ?variable. + ?type rdfs:label ?measure. +}" + +# where is the processing script stored? 
+repo_storageRoot <- "github"
+script_gitRepo <- "ScottishCovidResponse/SCRCdata"
+repo_version <- "0.1.0"
+processing_script <- "scotgov_management.R"
+
+# Next, go to the "download source data" section (line 164) and decide whether
+# download_from_database() or download_from_url() is the right call for you
+
+# Then put your own processing function in the "generate data product"
+# section (line 189)
+
+# Additional parameters ---------------------------------------------------
+# Everything below is derived from the values set above, on the assumption that:
+# (1) the source data is downloaded now
+# (2) the source data is processed into a data product now
+# (3) the source data is downloaded to data-raw/[product_name]
+# (4) the source data file is called [version_number].csv
+# (5) the data product is saved to data-raw/[product_name]
+# (6) the data product file is called [version_number].h5
+# (7) the source data will be uploaded to the Boydorr server
+# (8) the data product will be uploaded to the Boydorr server
+
+namespace <- "SCRC"
+
+# download time of the source data and generation time of the data product
+# (both are taken to be "now", see assumptions (1) and (2))
+source_downloadDate <- script_processingDate <- todays_date
+
+# version number, i.e. [yyyymmdd].[version]; this is the stem of the *.csv and
+# *.h5 filenames
+tmp <- as.Date(todays_date, format = "%Y-%m-%d")
+version_number <- paste(format(tmp, "%Y%m%d"), version, sep = ".")
+
+# local download location of the source data (before it is stored)
+local_path <- file.path("data-raw", product_name)
+source_filename <- paste(version_number, "csv", sep = ".")
+
+# local location of the data product (before it is stored)
+processed_path <- file.path("data-raw", product_name)
+product_filename <- paste(version_number, "h5", sep = ".")
+
+
+
+# where is the source data stored?
+source_storageRoot <- "boydorr"
+source_path <- file.path(product_name, source_filename)
+
+# where is the submission script stored?
+script_storageRoot <- "text_file"
+submission_text <- paste0("R -f inst/scripts/", processing_script)
+
+# where is the data product stored?
+# (product_path is the complete path relative to the storage root; it already
+# ends in product_filename)
+product_storageRoot <- "boydorr"
+product_path <- file.path(product_name, product_filename)
+
+
+# default data that should be in database ---------------------------------
+
+# original source name
+original_sourceId <- new_source(
+  name = source_name,
+  abbreviation = "Scottish Government Open Data Repository",
+  website = "https://statistics.gov.scot/",
+  key = key)
+
+# original source root (named after source_name, so that editing source_name
+# at the top of the template is enough)
+original_storageRootId <- new_storage_root(
+  name = source_name,
+  root = original_root,
+  key = key)
+
+# source data storage root
+source_storageRootId <- new_storage_root(name = source_storageRoot,
+                                         root = "ftp://boydorr.gla.ac.uk/scrc/",
+                                         key = key)
+
+# submission script storage root
+script_storageRootId <- new_storage_root(name = script_storageRoot,
+                                         root = "https://data.scrc.uk/api/text_file/",
+                                         key = key)
+# keep only the trailing numeric id of the returned URI (".../<id>/")
+tmp <- gsub("^.*/([0-9]+)/$", "\\1", script_storageRootId)
+script_path <- paste0(tmp, "/?format=text")
+
+# data product storage root
+product_storageRootId <- new_storage_root(name = product_storageRoot,
+                                          root = "ftp://boydorr.gla.ac.uk/scrc/",
+                                          key = key)
+
+# github repo storage root
+repo_storageRootId <- new_storage_root(name = repo_storageRoot,
+                                       root = "https://github.com",
+                                       key = key)
+
+# namespace
+namespaceId <- new_namespace(name = namespace,
+                             key = key)
+
+
+
+# download source data ----------------------------------------------------
+
+download_from_database(source_root = original_root,
+                       source_path = original_path,
+                       filename = source_filename,
+                       path = local_path)
+
+
+
+# upload source metadata to registry --------------------------------------
+
+# target_path is source_path, i.e. [product_name]/[source_filename]
+sourceDataURIs <- upload_source_data(
+  doi_or_unique_name = doi_or_unique_name,
+  original_source_id = original_sourceId,
+  original_root_id = original_storageRootId,
+  original_path = original_path,
+  local_path = file.path(local_path, source_filename),
+  storage_root_id = source_storageRootId,
+  target_path = source_path,
+  download_date = source_downloadDate,
+  version = version,
+  key = key)
+
+
+
+# generate data product ---------------------------------------------------
+
+# the data product is written to processed_path, which is where
+# upload_data_product() below expects to find it
+process_scotgov_management(
+  sourcefile = file.path(local_path, source_filename),
+  filename = file.path(processed_path, product_filename))
+
+
+
+# upload data product metadata to the registry ----------------------------
+
+# product_path is passed as-is: appending product_filename again would
+# register the file as [product_name]/[file].h5/[file].h5
+dataProductURIs <- upload_data_product(
+  storage_root_id = product_storageRootId,
+  name = product_name,
+  processed_path = file.path(processed_path, product_filename),
+  product_path = product_path,
+  version = version_number,
+  namespace_id = namespaceId,
+  key = key)
+
+
+
+# upload submission script metadata to the registry -----------------------
+
+submissionScriptURIs <- upload_submission_script(
+  storage_root_id = script_storageRootId,
+  path = script_path,
+  hash = openssl::sha1(submission_text),
+  text = submission_text,
+  run_date = script_processingDate,
+  key = key)
+
+
+
+# link objects together ---------------------------------------------------
+
+# the code repository lives under the github storage root, not under the
+# text_file root used for the submission script
+githubRepoURIs <- upload_github_repo(
+  storage_root_id = repo_storageRootId,
+  repo = script_gitRepo,
+  hash = get_github_hash(script_gitRepo),
+  version = repo_version,
+  key = key)
+
+upload_object_links(run_date = script_processingDate,
+                    run_identifier = paste("Script run to upload and process",
+                                           doi_or_unique_name),
+                    code_repo_id = githubRepoURIs$repo_objectId,
+                    submission_script_id = submissionScriptURIs$script_objectId,
+                    inputs = list(sourceDataURIs$source_objectComponentId),
+                    outputs = dataProductURIs$product_objectComponentId,
+                    key = key)