## ------------------------------------------------------- ##
# Silica WG - Extract Spatial Data - NPP
## ------------------------------------------------------- ##
# Written by:
## Angel Chen, Nick Lyon
# Purpose:
## Using the watershed shapefiles created in "wrangle-watersheds.R"
## Extract the following data: NPP
## ------------------------------------------------------- ##
# Housekeeping ----
## ------------------------------------------------------- ##
# Load needed libraries
# install.packages("librarian")
librarian::shelf(tidyverse, sf, stars, terra, exactextractr, NCEAS/scicomptools,
googledrive, readxl)
# Clear environment
rm(list = ls())
# Silence `summarize`
options(dplyr.summarise.inform = F)
# Identify path to location of shared data
(path <- scicomptools::wd_loc(local = F, remote_path = file.path('/', "home", "shares", "lter-si", "si-watershed-extract")))
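# NOTE: to run this script somewhere other than the shared server, you could point
# `path` at a local copy of the data instead (a sketch; the directory below is an
# assumption, not the shared path):
# path <- file.path("data", "si-watershed-extract")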
# Load in site names with lat/longs
sites <- readxl::read_excel(path = file.path(path, "site-coordinates",
"silica-coords_RAW.xlsx")) %>%
## Pare down to minimum needed columns
dplyr::select(LTER, Stream_Name, Discharge_File_Name, Shapefile_Name) %>%
## Drop duplicate rows (if any)
dplyr::distinct()
# Optionally remove any watersheds without a shapefile (disabled; uncomment to apply)
# dplyr::filter(!is.na(Shapefile_Name) &
# nchar(Shapefile_Name) != 0 &
# !Shapefile_Name %in% c("?", "MISSING"))
# Check it out
dplyr::glimpse(sites)
# Grab the shapefiles the previous script (see PURPOSE section) created
sheds <- sf::st_read(dsn = file.path(path, "site-coordinates", "silica-watersheds.shp")) %>%
# Expand names to what they were before
dplyr::rename(Shapefile_Name = shp_nm,
Stream_Name = Strm_Nm,
expert_area_km2 = exp_area,
shape_area_km2 = real_area)
# Combine sites and sheds to get ALL sheds (including HydroSHEDS) and their metadata (from the sites dataframe)
sheds <- sheds %>%
dplyr::left_join(y = sites, by = c("LTER", "Shapefile_Name"))
sheds$Stream_Name <- ifelse(!is.na(sheds$Stream_Name.x), sheds$Stream_Name.x, sheds$Stream_Name.y)
sheds$Discharge_File_Name <- ifelse(!is.na(sheds$Dsc_F_N), sheds$Dsc_F_N, sheds$Discharge_File_Name)
sheds <- sheds %>%
  dplyr::select(-c(Stream_Name.x, Stream_Name.y, expert_area_km2, shape_area_km2,
                   exp_are, hydrshd, real_ar, Dsc_F_N))
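# NOTE: the two ifelse() calls above could equivalently use dplyr::coalesce(),
# which returns the first non-NA value element-wise. A sketch of that alternative
# (not the approach used here; not run):
# sheds <- sheds %>%
#   dplyr::mutate(Stream_Name = dplyr::coalesce(Stream_Name.x, Stream_Name.y),
#                 Discharge_File_Name = dplyr::coalesce(Dsc_F_N, Discharge_File_Name))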
# Check that out
dplyr::glimpse(sheds)
# Clean up environment
rm(list = setdiff(ls(), c('path', 'sites', 'sheds')))
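# (Optional) sanity check -- the extraction below assumes valid polygon geometries,
# so it can be worth confirming before the long-running loops:
# all(sf::st_is_valid(sheds)) # should be TRUE; if not, try sf::st_make_valid(sheds)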
## ------------------------------------------------------- ##
# MOD17A3HGF -v061 - Identify Files ----
## ------------------------------------------------------- ##
# Make an empty list
file_list <- list()
## NEW SITES added for Data Release 2 ##
for(region in c("north-america-usa", "north-america-arctic",
"cropped-russia-west", "cropped-russia-west-2",
"cropped-russia-center", "cropped-russia-east",
"puerto-rico", "scandinavia",
"amazon", "australia",
"canada", "congo",
"germany", "united-kingdom")){
# This part is new -- we want to allow old and new versions of MODIS
# Identify files in that folder
file_df <- data.frame("region" = region,
"files" = dir(path = file.path(path, "raw-driver-data",
"raw-npp-v061", region))) %>%
dplyr::filter(stringr::str_detect(string = files, pattern = "MOD17A3HGF.061_Npp_500m"))
# Add that set of files to the list
file_list[[region]] <- file_df
}
# Wrangle the list
file_all <- file_list %>%
# Unlist the loop's output
purrr::list_rbind() %>%
# Identify date from file name
dplyr::mutate(date_raw = stringr::str_extract(string = files,
pattern = "_doy[[:digit:]]{7}")) %>%
# Simplify that column
dplyr::mutate(date = gsub(pattern = "_doy", replacement = "", x = date_raw)) %>%
# Identify year
dplyr::mutate(year = stringr::str_sub(string = date, start = 1, end = 4)) %>%
# Drop 'raw' version
dplyr::select(-date_raw)
# Glimpse it
dplyr::glimpse(file_all)
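# (Optional) quick check that every region contributed at least one file; a region
# whose folder had no matching files would be silently absent from `file_all`:
# dplyr::count(file_all, region)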
# Clean up environment
rm(list = setdiff(ls(), c('path', 'sites', 'sheds', 'file_all')))
## ------------------------------------------------------- ##
# NPP - Extract ----
## ------------------------------------------------------- ##
# Specify driver
focal_driver <- "raw-npp-v061"
# Identify files we've already extracted from
done_files <- data.frame("files" = dir(file.path(path, "raw-driver-data",
focal_driver,
"_partial-extracted"))) %>%
tidyr::separate(col = files, remove = F,
into = c("junk", "junk2", "year", "file_ext"))
# Remove completed files from the set of all possible files
not_done <- file_all %>%
dplyr::filter(!year %in% done_files$year)
# Create a definitive object of files to extract
# file_set <- not_done # Use this to extract only years not yet done
file_set <- file_all # Use this to (re-)extract all years
# Loop across years...
for (a_year in unique(file_set$year)){
# Starting message
message("Processing begun for year ", a_year)
# Subset to one year
one_year_data <- dplyr::filter(file_set, year == a_year)
# Make a list to house extracted information for a year
year_list <- list()
for (i in 1:nrow(one_year_data)){
# Read in the raster
npp_raster <- terra::rast(file.path(path, "raw-driver-data", "raw-npp-v061", one_year_data$region[i], one_year_data$files[i]))
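# (Optional) exact_extract() expects the polygons and raster to share a CRS
# (it warns otherwise); a quick check, as a sketch not in the original workflow:
# terra::same.crs(npp_raster, terra::vect(sheds)) # should be TRUE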
# Extract all pixel values from that raster within each watershed
ex_data <- exactextractr::exact_extract(x = npp_raster, y = sheds,
include_cols = c("LTER", "Shapefile_Name"),
progress = FALSE) %>%
# Unlist to dataframe
purrr::list_rbind() %>%
# Drop coverage fraction column
dplyr::select(-coverage_fraction) %>%
# Drop NA values that were "extracted"
## I.e., those outside of the current raster's bounding box
dplyr::filter(!is.na(value)) %>%
# Drop invalid values (per product documentation page)
dplyr::filter(value >= -30000 & value <= 32700) %>%
# Make new relevant columns
dplyr::mutate(year = a_year,
.after = Shapefile_Name)
# Add this dataframe to the list we made
year_list[[i]] <- ex_data
# End message
message("Finished extracting raster ", i, " of ", nrow(one_year_data), " (", a_year, ")")
}
# Assemble a file name for this extraction
export_name <- paste0("npp_extract_", a_year, ".csv")
# Wrangle the output of the within-year extraction
full_data <- year_list %>%
# Unlist to dataframe
purrr::list_rbind() %>%
# Apply scale factor
## See "Layers" dropdown here: https://lpdaac.usgs.gov/products/mod17a3hgfv061/
# dplyr::mutate(descaled_val = value * 0.0001) %>%
# Handle the summarization within river (potentially across multiple rasters' pixels)
dplyr::group_by(LTER, Shapefile_Name, year) %>%
#dplyr::summarize(npp = mean(descaled_val, na.rm = T)) %>% ## This is mean per pixel
dplyr::summarize(npp = mean(value, na.rm = T)) %>% ## This is mean per pixel -- checking to see if data are already scaled
dplyr::ungroup() %>%
# Drop unnecessary columns
dplyr::select(-year) %>%
dplyr::rename_with(.fn = ~paste0("npp_", a_year, "_kgC_m2_year"), .cols = npp)
# Export this file for a given year
write.csv(x = full_data, row.names = F, na = '',
file = file.path(path, "raw-driver-data", "raw-npp-v061",
"_partial-extracted", export_name))
# End message
message("Finished wrangling output for ", a_year) }
## ------------------------------------------------------- ##
# NPP - Export ----
## ------------------------------------------------------- ##
# Identify extracted data
done_files <- dir(file.path(path, "raw-driver-data", focal_driver, "_partial-extracted"))
# Make an empty list
full_out <- list()
# Read all of these files in
for(k in 1:length(done_files)){
# Read in the kth file
full_out[[k]] <- read.csv(file = file.path(path, "raw-driver-data", focal_driver,
"_partial-extracted", done_files[k]))
# Finish
message("Retrieved file ", k, " of ", length(done_files))}
# Unlist that list
out_df <- full_out %>%
purrr::reduce(dplyr::left_join, by = c("LTER", "Shapefile_Name"))
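# (Optional) sanity check -- after the joins, expect one row per watershed and
# one "npp_<year>_kgC_m2_year" column per extracted year:
# nrow(out_df); setdiff(names(out_df), c("LTER", "Shapefile_Name"))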
# Glimpse it
dplyr::glimpse(out_df)
# Let's get ready to export
npp_export <- sheds %>%
# Join the npp data
dplyr::left_join(y = out_df, by = c("LTER", "Shapefile_Name")) %>%
# Drop the geometry column, which causes issues on export
sf::st_drop_geometry()
# Check it out
dplyr::glimpse(npp_export)
# Create folder to export to
dir.create(path = file.path(path, "extracted-data"), showWarnings = F)
# Export the summarized npp data
write.csv(x = npp_export, na = '', row.names = F,
file = file.path(path, "extracted-data", "si-extract_npp_2_v061.csv"))
# Upload to GoogleDrive
googledrive::drive_upload(media = file.path(path, "extracted-data", "si-extract_npp_2_v061.csv"),
overwrite = T,
path = googledrive::as_id("https://drive.google.com/drive/u/0/folders/1FBq2-FW6JikgIuGVMX5eyFRB6Axe2Hld"))
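# NOTE: drive_upload() will trigger an interactive OAuth prompt if no token is
# cached; you can also authenticate explicitly beforehand (an assumption about
# your setup, not a required step):
# googledrive::drive_auth()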
# End ----