# 01_WebScraper.R
library(httr)
library(rvest)
library(Rcrawler)
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
rm(list = ls())
# set the working directory to the folder containing this script (requires RStudio)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# the output files are written to a ./data/ subfolder; create it if it does not exist yet
dir.create('./data/', showWarnings = FALSE)
# initiate empty matrix to keep track of the crawler history (visited pages)
crawl_hist <- matrix(ncol = 2, nrow = 0)
colnames(crawl_hist) <- c("visited_URL", "found_URLs")
# initiate control variable for the frontier
frontier_not_empty <- TRUE
# initiate counter variable
counter <- 1
# initiate empty frontier (one list element of found URLs per index page)
frontier <- list()
# set n low to test the code,
# otherwise set it to the maximum number of index pages: 609
n <- 609
for (i in 1:n){
  # FETCHING OF WEBPAGE -----
  # construct the URL of index page i (the overview pages are simply numbered,
  # so no real dequeueing from the frontier is needed)
  current_URL <- paste0('https://www.presidency.ucsb.edu/documents/app-categories/written-presidential-orders/presidential/executive-orders?page=', i)
  # fetch page via HTTP
  resp <- GET(current_URL)
  # parse HTML
  html_doc <- read_html(resp)
  # LINK EXTRACTION -----
  # extract all URLs
  links_nodes <- html_nodes(html_doc, xpath = ".//a")
  URLs <- html_attr(links_nodes, name = "href")
  # clean URLs:
  # remove anchor links
  URLs <- URLs[!grepl("#", URLs, fixed = TRUE)]
  # remove NAs
  URLs <- na.omit(URLs)
  # canonicalize URLs (e.g. turn relative links into absolute ones) and keep unique entries
  URLs <- unique(LinkNormalization(links = URLs, current = current_URL))
  # add to the tail of the frontier (first-in-first-out -> breadth-first)
  frontier[[i]] <- URLs
  # DATA PART -----
  # create log entry
  log_df <- data.frame(visited_URL = current_URL, found_URLs = URLs)
  # add to crawler history
  crawl_hist <- rbind(crawl_hist, as.matrix(log_df))
  # CONTROL PART -----
  # update control variable (kept for completeness; the for loop does not use it)
  frontier_not_empty <- length(frontier) != 0
  # display status on screen
  cat(paste0("\n", "Page no. ", counter, " visited!\n", current_URL))
  # update counter
  counter <- counter + 1
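  # optional addition (not in the original script): pause briefly between
  # requests to be polite to the server; uncomment to enable
  # Sys.sleep(1)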
}
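# quick check after crawling (a sketch, not in the original script):
# how many index pages were visited, and how many links were collected per page?
length(frontier)
summary(sapply(frontier, length))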
# transform the frontier to a dataframe of URLs
df <- data.frame(urls = unlist(frontier))
# keep only links that point to individual executive-order documents
filtered.urls <- df %>%
  filter(str_detect(urls, 'https://www.presidency.ucsb.edu/documents/executive-order'))
# drop duplicate URLs
filtered.urls <- unique(filtered.urls)
# save the URLs in case something goes wrong after this point
fwrite(filtered.urls, './data/filtered_urls.csv')
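# if the scraping step below crashes, the saved URL list can be reloaded from
# disk without re-crawling the index pages (an optional convenience, not in
# the original workflow):
# filtered.urls <- fread('./data/filtered_urls.csv')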
# set up lists for the extraction
url.list <- list()
body.list <- list()
h1.list <- list()
h2.list <- list()
h3.list <- list()
date.list <- list()
fail.list <- list()
# note: a failure was previously observed at link no. 1716, hence the tryCatch below
# web scraping of the texts
for (i in 1:nrow(filtered.urls)){
  tryCatch({
    # read html
    url <- filtered.urls$urls[i]
    html <- read_html(url)
    # store the url
    url.list[[i]] <- url
    # read the text body
    body.list[[i]] <- html %>%
      html_nodes("p") %>%
      html_text() %>%
      toString()
    # get the titles
    h1.list[[i]] <- html %>% html_nodes("h1") %>% html_text() %>% toString()
    h2.list[[i]] <- html %>% html_nodes("h2") %>% html_text() %>% toString()
    h3.list[[i]] <- html %>% html_nodes("h3") %>% html_text() %>% toString()
    # get the date
    date.list[[i]] <- toString(html %>% html_nodes("div.field-docs-start-date-time") %>% html_text())
    # print progress
    print(paste0('success link nr. ', i))
  },
  error = function(cond) {
    message(paste("URL does not seem to exist:", url))
    # use <<- so the failed URL is recorded in the outer fail.list,
    # not in a copy local to this error handler
    fail.list[[i]] <<- url
    message("Here's the original error message:")
    message(cond)
    # choose a return value in case of error
    return(NA)
  }
  )
}
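# optional sketch (an addition, not in the original script): collect the failed
# URLs recorded by the error handler above and write them out for a later retry;
# the file name failed_urls.csv is just an example
failed <- unlist(fail.list)
if (length(failed) > 0) {
  fwrite(data.frame(urls = failed), './data/failed_urls.csv')
}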
# merge all lists into a dataframe (one row per successfully scraped document)
df <- do.call(rbind, Map(data.frame,
                         URL = url.list,
                         H1 = h1.list,
                         H2 = h2.list,
                         H3 = h3.list,
                         body = body.list,
                         date = date.list))
# save the dataframe (the ./data/ folder was created at the top of the script)
fwrite(df, './data/executive_orders.csv')
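# quick sanity check (a sketch, not part of the original pipeline): reload the
# saved file and inspect its structure
eo <- fread('./data/executive_orders.csv')
str(eo)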